@vespermcp/mcp-server 1.1.3 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -5,10 +5,13 @@ import { CallToolRequestSchema, ListToolsRequestSchema, ErrorCode, McpError, } f
5
5
  import { fileURLToPath } from "url";
6
6
  import path from "path";
7
7
  import fs from "fs";
8
+ import { spawn } from "child_process";
8
9
  import { MetadataStore } from "./metadata/store.js";
9
10
  import { VectorStore } from "./search/vector-store.js";
10
11
  import { Embedder } from "./search/embedder.js";
11
12
  import { SearchEngine } from "./search/engine.js";
13
+ import { HuggingFaceScraper } from "./metadata/scraper.js";
14
+ import { KaggleSource } from "./metadata/kaggle-source.js";
12
15
  import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
13
16
  import { JobManager } from "./jobs/manager.js";
14
17
  import { QualityAnalyzer } from "./quality/analyzer.js";
@@ -17,6 +20,7 @@ import { DataCleaner } from "./cleaning/cleaner.js";
17
20
  import { PipelineExecutor } from "./cleaning/executor.js";
18
21
  import { DataSplitter } from "./splitting/splitter.js";
19
22
  import { DataExporter } from "./export/exporter.js";
23
+ import { DataFusionEngine } from "./fusion/engine.js";
20
24
  import { DataIngestor } from "./ingestion/ingestor.js";
21
25
  import { InstallService } from "./install/install-service.js";
22
26
  import { CacheService, MockRedisProvider } from "./cache/service.js";
@@ -24,6 +28,8 @@ import { ImageAnalyzer } from "./quality/image-analyzer.js";
24
28
  import { MediaAnalyzer } from "./quality/media-analyzer.js";
25
29
  import { QualityOrchestrator } from "./quality/quality-orchestrator.js";
26
30
  import { ConfigManager } from "./config/config-manager.js";
31
+ import { SecureKeysManager } from "./config/secure-keys.js";
32
+ import readline from "readline";
27
33
  import os from "os";
28
34
  // Determine absolute paths relative to the compiled script
29
35
  const __filename = fileURLToPath(import.meta.url);
@@ -49,6 +55,109 @@ function logError(err, context) {
49
55
  fs.appendFileSync(errorLogPath, msg);
50
56
  console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
51
57
  }
58
// Braille spinner animation frames, cycled every 90 ms by runWithSpinner.
const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
// Print the Vesper ASCII launch banner.
// Written to stderr (via console.error) so it never corrupts the MCP
// stdio protocol stream on stdout.
// NOTE(review): the banner's whitespace below was recovered from a mangled
// diff view — exact column alignment may differ from the published build.
function printLaunchScreen() {
    const screen = `
══════════════════════════════════════════════

██ ██ ███████ ███████ ██████ ███████ ██████
██ ██ ██ ██ ██ ██ ██ ██ ██
██ ██ █████ █████ ██████ █████ ██████
██ ██ ██ ██ ██ ██ ██ ██
████ ███████ ███████ ██ ███████ ██ ██

dataset intelligence layer
mcp-native • agent-first

══════════════════════════════════════════════

[ core ] initializing
[ splitting ] leakage-safe
[ quality ] multimodal scan
[ fusion ] guarded
[ synth ] generation ready

status: operational
`;
    console.error(screen);
}
84
/**
 * Run an async task while rendering a braille spinner on stderr.
 *
 * The spinner only appears for tasks slower than one second, and only
 * when stderr is an interactive terminal; otherwise the task runs with
 * no visual output at all.
 *
 * @param {string} label - text shown next to the spinner frames
 * @param {() => Promise<any>} task - async work to execute
 * @returns {Promise<any>} the task's resolved value
 * @throws rethrows whatever the task throws, after stopping the spinner
 */
async function runWithSpinner(label, task) {
    // Non-interactive stderr (piped logs): skip all rendering.
    if (!process.stderr.isTTY) {
        return task();
    }
    let frame = 0;
    let interval;
    let visible = false;
    // Delay the spinner so fast tasks stay silent.
    const startDelay = setTimeout(() => {
        visible = true;
        interval = setInterval(() => {
            const glyph = SPINNER_FRAMES[frame % SPINNER_FRAMES.length];
            frame += 1;
            process.stderr.write(`\r${glyph} ${label}`);
        }, 90);
    }, 1000);
    // Shared teardown for both the success and failure paths.
    const stop = () => {
        clearTimeout(startDelay);
        if (interval)
            clearInterval(interval);
    };
    try {
        const result = await task();
        stop();
        if (visible)
            process.stderr.write(`\r[ok] ${label} \n`);
        return result;
    }
    catch (error) {
        stop();
        if (visible)
            process.stderr.write(`\r[error] ${label} \n`);
        throw error;
    }
}
117
/**
 * Extract a requested sample/row count from free-form user text.
 *
 * Prefers an explicit "<number> rows/samples/records" phrase (commas and
 * spaces inside the number are tolerated); otherwise falls back to the
 * largest bare 4-9 digit number found anywhere in the text.
 *
 * @param {string|undefined} query - user search query
 * @param {string|undefined} requirements - extra requirements text
 * @returns {number|undefined} requested row count, or undefined if none found
 */
function extractRequestedRows(query, requirements) {
    const haystack = `${query || ""} ${requirements || ""}`.toLowerCase();
    // Explicit phrase like "10,000 rows" wins outright.
    const phrase = haystack.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
    if (phrase) {
        const parsed = Number(phrase[1].replace(/[\s,]/g, ""));
        if (Number.isFinite(parsed) && parsed > 0) {
            return parsed;
        }
    }
    // Fallback: take the biggest standalone 4-9 digit number mentioned.
    const candidates = [];
    for (const hit of haystack.matchAll(/\b\d{4,9}\b/g)) {
        const value = Number(hit[0]);
        if (Number.isFinite(value) && value > 0) {
            candidates.push(value);
        }
    }
    return candidates.length > 0 ? Math.max(...candidates) : undefined;
}
132
/**
 * Run a Python helper script and parse its stdout as JSON.
 *
 * @param {string} scriptPath - path to the helper script
 * @param {string[]} args - CLI arguments forwarded to the script
 * @returns {Promise<any>} parsed JSON object from the script's stdout
 * Rejects when the interpreter cannot be spawned, exits non-zero, or
 * prints something that is not valid JSON.
 */
function runPythonJson(scriptPath, args) {
    // Windows ships the "py" launcher; elsewhere rely on "python" on PATH.
    const pyCmd = process.platform === "win32" ? "py" : "python";
    return new Promise((resolve, reject) => {
        const proc = spawn(pyCmd, [scriptPath, ...args]);
        let stdout = "";
        let stderr = "";
        proc.stdout.on("data", (d) => (stdout += d.toString()));
        proc.stderr.on("data", (d) => (stderr += d.toString()));
        // FIX: without an "error" listener, a failed spawn (e.g. Python not
        // installed) emits an unhandled ChildProcess "error" event and
        // crashes the server instead of rejecting this promise.
        proc.on("error", (err) => {
            reject(new Error(`Failed to launch ${pyCmd}: ${err.message}`, { cause: err }));
        });
        proc.on("close", (code) => {
            if (code !== 0) {
                reject(new Error(stderr || stdout || `Python exited with ${code}`));
                return;
            }
            try {
                resolve(JSON.parse(stdout));
            }
            catch {
                reject(new Error(`Invalid JSON from python helper: ${stdout}`));
            }
        });
    });
}
154
/**
 * Count data rows in a local dataset file via the row_count.py helper.
 *
 * @param {string} filePath - dataset file to inspect
 * @returns {Promise<number>} row count (0 when the helper reports none)
 * @throws {Error} when the helper reports failure (result.ok is falsy)
 */
async function countRows(filePath) {
    const helper = path.join(dataRoot, "python", "row_count.py");
    const response = await runPythonJson(helper, [filePath]);
    if (!response.ok) {
        throw new Error(response.error || "Failed to count rows");
    }
    return Number(response.rows || 0);
}
52
161
  /**
53
162
  * Sync Python scripts from the application package to the stable data directory (~/.vesper/python)
54
163
  */
@@ -105,6 +214,21 @@ const dataCleaner = new DataCleaner(__dirname);
105
214
  const pipelineExecutor = new PipelineExecutor(dataRoot, __dirname);
106
215
  const dataSplitter = new DataSplitter(__dirname);
107
216
  const dataExporter = new DataExporter(__dirname);
217
+ const fusionEngine = new DataFusionEngine(__dirname);
218
+ const kaggleSource = new KaggleSource(__dirname);
219
+ const secureKeys = new SecureKeysManager(__dirname);
220
/**
 * Copy stored external API keys into process.env when not already set.
 *
 * Existing environment variables always win over the secure key store,
 * so user-supplied env configuration is never overwritten.
 * NOTE(review): assumes secureKeys.getAll() returns a plain object keyed
 * by hf_token / kaggle_username / kaggle_key — confirm against SecureKeysManager.
 */
function hydrateExternalKeys() {
    const stored = secureKeys.getAll();
    // HF token honours either of the two conventional env var names.
    const hfAlreadySet = Boolean(process.env.HF_TOKEN) || Boolean(process.env.HUGGINGFACE_TOKEN);
    if (!hfAlreadySet && stored.hf_token) {
        process.env.HF_TOKEN = String(stored.hf_token);
    }
    // Kaggle vars map 1:1 onto lowercase store keys.
    for (const envName of ["KAGGLE_USERNAME", "KAGGLE_KEY"]) {
        const storeKey = envName.toLowerCase();
        if (!process.env[envName] && stored[storeKey]) {
            process.env[envName] = String(stored[storeKey]);
        }
    }
}
108
232
  // CRITICAL FIX: Pass __dirname (build directory) to analyzers
109
233
  // Python scripts are in build/python/, so analyzers should look relative to build/
110
234
  // NOT relative to project root (appRoot)
@@ -117,11 +241,11 @@ const qualityOrchestrator = new QualityOrchestrator(__dirname);
117
241
  // Subscribe to job updates for real-time streaming to the UI
118
242
  jobManager.on("jobUpdated", (job) => {
119
243
  const level = job.status === "failed" ? "error" : "info";
120
- const emoji = job.status === "completed" ? "" : (job.status === "failed" ? "" : "");
244
+ const statusTag = job.status === "completed" ? "done" : (job.status === "failed" ? "failed" : "running");
121
245
  const progress = job.progress > 0 ? `[${job.progress}%]` : "";
122
246
  server.sendLoggingMessage({
123
247
  level,
124
- data: `${emoji} [Job ${job.id.substring(0, 8)}] ${progress} ${job.status_text}`
248
+ data: `[${statusTag}] [Job ${job.id.substring(0, 8)}] ${progress} ${job.status_text}`
125
249
  });
126
250
  });
127
251
  // IMPORTANT: Execute jobs when the manager emits them
@@ -136,7 +260,7 @@ jobManager.on("processJob", async (job, execute) => {
136
260
  console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
137
261
  const metadata = job.metadata ? JSON.parse(job.metadata) : {};
138
262
  switch (job.type) {
139
- case "prepare": return await handlePrepareJob(job.id, metadata.query);
263
+ case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
140
264
  case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
141
265
  default: throw new Error(`Unhandled job type: ${job.type}`);
142
266
  }
@@ -154,8 +278,9 @@ jobManager.on("processJob", async (job, execute) => {
154
278
  /**
155
279
  * Logic for preparing a dataset (Search + Ingest + Process)
156
280
  */
157
- async function handlePrepareJob(jobId, query) {
281
+ async function handlePrepareJob(jobId, query, requirements) {
158
282
  const update = (updates) => jobManager.updateJob(jobId, updates);
283
+ const requestedRows = extractRequestedRows(query, requirements);
159
284
  update({ progress: 10, status_text: "Searching for best dataset matching query..." });
160
285
  const results = await searchEngine.search(query, { limit: 1 });
161
286
  if (results.length === 0) {
@@ -176,9 +301,59 @@ async function handlePrepareJob(jobId, query) {
176
301
  }
177
302
  update({ progress: 30, status_text: `Starting download from ${source}...` });
178
303
  // ensureData handles download and returns path to the raw file
179
- const rawFilePath = await dataIngestor.ensureData(topDataset.id, source, (msg, prog) => {
304
+ let rawFilePath = await dataIngestor.ensureData(topDataset.id, source, (msg, prog) => {
180
305
  update({ status_text: msg, progress: 30 + (prog ? Math.floor(prog * 0.4) : 0) });
181
306
  });
307
+ if (requestedRows && requestedRows > 0) {
308
+ update({ progress: 62, status_text: `Validating requested sample count (${requestedRows.toLocaleString()})...` });
309
+ let currentRows = await countRows(rawFilePath);
310
+ if (currentRows < requestedRows) {
311
+ update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
312
+ const additional = await searchEngine.search(query, { limit: 8 });
313
+ const sourceFiles = [rawFilePath];
314
+ let totalRows = currentRows;
315
+ for (const ds of additional) {
316
+ if (ds.id === topDataset.id)
317
+ continue;
318
+ try {
319
+ const dsSource = ds.source;
320
+ if (dsSource === "kaggle" && !dataIngestor.hasKaggleCredentials())
321
+ continue;
322
+ const p = await dataIngestor.ensureData(ds.id, dsSource, () => undefined);
323
+ const r = await countRows(p);
324
+ if (r <= 0)
325
+ continue;
326
+ sourceFiles.push(p);
327
+ totalRows += r;
328
+ if (totalRows >= requestedRows)
329
+ break;
330
+ }
331
+ catch {
332
+ // ignore candidate failures and continue trying
333
+ }
334
+ }
335
+ if (sourceFiles.length > 1) {
336
+ update({ progress: 67, status_text: `Fusing ${sourceFiles.length} datasets to meet row target...` });
337
+ const fusedPath = path.join(dataRoot, "fusion", `prepare_fused_${Date.now()}.feather`);
338
+ const fusionResult = await fusionEngine.fuse(sourceFiles, fusedPath, {
339
+ strategy: "concat",
340
+ dedup: true,
341
+ run_quality_after: false,
342
+ leakage_check: false,
343
+ output_format: "feather",
344
+ compression: "lz4",
345
+ preview: true,
346
+ });
347
+ rawFilePath = fusionResult.output_path;
348
+ currentRows = await countRows(rawFilePath);
349
+ }
350
+ if (currentRows < requestedRows) {
351
+ throw new Error(`Requested ${requestedRows.toLocaleString()} samples, but only ${currentRows.toLocaleString()} available across current matches. ` +
352
+ `Try broader query or enable additional sources.`);
353
+ }
354
+ update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
355
+ }
356
+ }
182
357
  update({ progress: 70, status_text: "Analyzing dataset quality..." });
183
358
  const report = await qualityAnalyzer.analyze(rawFilePath);
184
359
  // Update local metadata with quality info
@@ -234,10 +409,84 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
234
409
  type: "string",
235
410
  description: "The search query. Use -term to exclude keywords.",
236
411
  },
412
+ enable_jit: {
413
+ type: "boolean",
414
+ description: "Enable live JIT search when local library results are insufficient (default: false).",
415
+ },
416
+ },
417
+ required: ["query"],
418
+ },
419
+ },
420
+ {
421
+ name: "discover_datasets",
422
+ description: "Discover datasets from a specific source. Kaggle is optional and requires user-provided API key.",
423
+ inputSchema: {
424
+ type: "object",
425
+ properties: {
426
+ query: {
427
+ type: "string",
428
+ description: "Search query, e.g. 'credit risk'.",
429
+ },
430
+ source: {
431
+ type: "string",
432
+ enum: ["huggingface", "kaggle"],
433
+ description: "Data source to discover from.",
434
+ },
435
+ limit: {
436
+ type: "number",
437
+ description: "Max results to return (default: 10).",
438
+ },
237
439
  },
238
440
  required: ["query"],
239
441
  },
240
442
  },
443
+ {
444
+ name: "download_dataset",
445
+ description: "Download a dataset by source and ID/slug into local Vesper storage. Kaggle requires optional API key.",
446
+ inputSchema: {
447
+ type: "object",
448
+ properties: {
449
+ source: {
450
+ type: "string",
451
+ enum: ["huggingface", "kaggle"],
452
+ description: "Dataset source.",
453
+ },
454
+ dataset_id: {
455
+ type: "string",
456
+ description: "Dataset ID/slug (e.g. user/dataset for Kaggle or HF).",
457
+ },
458
+ target_dir: {
459
+ type: "string",
460
+ description: "Optional target directory for downloaded files.",
461
+ }
462
+ },
463
+ required: ["source", "dataset_id"],
464
+ },
465
+ },
466
+ {
467
+ name: "configure_kaggle",
468
+ description: "Optionally store Kaggle API credentials for Kaggle discover/download. Core Vesper works without this.",
469
+ inputSchema: {
470
+ type: "object",
471
+ properties: {
472
+ username: { type: "string", description: "Kaggle username" },
473
+ key: { type: "string", description: "Kaggle API key" }
474
+ },
475
+ required: ["username", "key"],
476
+ },
477
+ },
478
+ {
479
+ name: "configure_keys",
480
+ description: "One-time optional key setup for external sources (Kaggle + gated HF). Core tools do not require keys.",
481
+ inputSchema: {
482
+ type: "object",
483
+ properties: {
484
+ hf_token: { type: "string", description: "Optional Hugging Face token for gated/private datasets" },
485
+ kaggle_username: { type: "string", description: "Optional Kaggle username" },
486
+ kaggle_key: { type: "string", description: "Optional Kaggle API key" }
487
+ },
488
+ },
489
+ },
241
490
  {
242
491
  name: "get_dataset_info",
243
492
  description: "Get detailed metadata for a specific dataset by its ID. Returns comprehensive information including license, safety flags, and data characteristics.",
@@ -346,7 +595,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
346
595
  },
347
596
  {
348
597
  name: "export_dataset",
349
- description: "Export an ingested or prepared dataset to a specific local directory.",
598
+ description: "Export a dataset to a local directory. Use format='feather' (default) for 5-10× faster writes than CSV. Add fast=true to skip quality/cleaning steps.",
350
599
  inputSchema: {
351
600
  type: "object",
352
601
  properties: {
@@ -360,13 +609,93 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
360
609
  },
361
610
  format: {
362
611
  type: "string",
363
- enum: ["csv", "parquet"],
364
- description: "Desired output format (default: csv).",
612
+ enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
613
+ description: "Output format. feather (fastest), parquet (best compression), csv (human-readable). Default: feather.",
614
+ },
615
+ compression: {
616
+ type: "string",
617
+ enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"],
618
+ description: "Compression algorithm. Default: lz4 for feather, snappy for parquet, none for csv.",
619
+ },
620
+ fast: {
621
+ type: "boolean",
622
+ description: "Skip quality analysis and cleaning – raw export only. Much faster. Default: false.",
623
+ },
624
+ preview: {
625
+ type: "boolean",
626
+ description: "Generate a small 500-row CSV preview alongside binary exports. Default: false.",
627
+ },
628
+ sample_rows: {
629
+ type: "number",
630
+ description: "Export only this many random rows (faster for huge datasets).",
631
+ },
632
+ columns: {
633
+ type: "array",
634
+ items: { type: "string" },
635
+ description: "Export only these columns (faster for wide datasets).",
365
636
  },
366
637
  },
367
638
  required: ["dataset_id"],
368
639
  },
369
640
  },
641
+ {
642
+ name: "fuse_datasets",
643
+ description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
644
+ inputSchema: {
645
+ type: "object",
646
+ properties: {
647
+ sources: {
648
+ type: "array",
649
+ items: { type: "string" },
650
+ description: "List of dataset IDs and/or local file paths to fuse.",
651
+ },
652
+ strategy: {
653
+ type: "string",
654
+ enum: ["concat", "join"],
655
+ description: "Fusion strategy. concat appends rows; join merges on key(s).",
656
+ },
657
+ join_on: {
658
+ oneOf: [
659
+ { type: "string" },
660
+ { type: "array", items: { type: "string" } }
661
+ ],
662
+ description: "Join key(s). Required when strategy='join'.",
663
+ },
664
+ how: {
665
+ type: "string",
666
+ enum: ["inner", "left", "outer"],
667
+ description: "Join mode (only for strategy='join').",
668
+ },
669
+ dedup: {
670
+ type: "boolean",
671
+ description: "Drop exact duplicate rows after fusion.",
672
+ },
673
+ run_quality_after: {
674
+ type: "boolean",
675
+ description: "Run quality analysis on the fused output.",
676
+ },
677
+ leakage_check: {
678
+ type: "boolean",
679
+ description: "Run leakage/overlap checks across fused sources.",
680
+ },
681
+ output_format: {
682
+ type: "string",
683
+ enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
684
+ description: "Output format (default: feather).",
685
+ },
686
+ compression: {
687
+ type: "string",
688
+ enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"],
689
+ description: "Compression algorithm for binary outputs.",
690
+ },
691
+ preview: {
692
+ type: "boolean",
693
+ description: "Generate a small preview CSV of fused output.",
694
+ },
695
+ },
696
+ required: ["sources"],
697
+ },
698
+ },
370
699
  {
371
700
  name: "analyze_image_quality",
372
701
  description: "Analyze image quality (resolution, blur, corruption) for a folder or single image.",
@@ -423,10 +752,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
423
752
  const query = String(request.params.arguments?.query);
424
753
  const limit = 5;
425
754
  const safeOnly = true; // Enable safe filter by default
755
+ const enableJIT = request.params.arguments?.enable_jit === true;
426
756
  if (!query) {
427
757
  throw new McpError(ErrorCode.InvalidParams, "Query is required");
428
758
  }
429
- const results = await searchEngine.search(query, { limit, safeOnly });
759
+ const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
430
760
  const formattedOutput = formatSearchResults(results);
431
761
  return {
432
762
  content: [
@@ -437,6 +767,123 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
437
767
  ],
438
768
  };
439
769
  }
770
+ case "discover_datasets": {
771
+ hydrateExternalKeys();
772
+ const query = String(request.params.arguments?.query || "").trim();
773
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
774
+ const limit = Number(request.params.arguments?.limit || 10);
775
+ if (!query) {
776
+ throw new McpError(ErrorCode.InvalidParams, "query is required");
777
+ }
778
+ try {
779
+ let results = [];
780
+ if (source === "kaggle") {
781
+ if (!dataIngestor.hasKaggleCredentials()) {
782
+ return {
783
+ content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
784
+ isError: true,
785
+ };
786
+ }
787
+ results = await kaggleSource.discover(query, limit);
788
+ }
789
+ else {
790
+ const hf = new HuggingFaceScraper();
791
+ results = await hf.scrape(Math.max(1, limit), true, query);
792
+ }
793
+ const formattedOutput = formatSearchResults(results.slice(0, limit));
794
+ return {
795
+ content: [{ type: "text", text: formattedOutput }]
796
+ };
797
+ }
798
+ catch (error) {
799
+ return {
800
+ content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
801
+ isError: true,
802
+ };
803
+ }
804
+ }
805
+ case "download_dataset": {
806
+ hydrateExternalKeys();
807
+ const source = String(request.params.arguments?.source || "").toLowerCase();
808
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
809
+ if (!source || !datasetId) {
810
+ throw new McpError(ErrorCode.InvalidParams, "source and dataset_id are required");
811
+ }
812
+ if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
813
+ return {
814
+ content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
815
+ isError: true,
816
+ };
817
+ }
818
+ try {
819
+ const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
820
+ return {
821
+ content: [{ type: "text", text: `Download complete: ${localPath}` }]
822
+ };
823
+ }
824
+ catch (error) {
825
+ return {
826
+ content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
827
+ isError: true,
828
+ };
829
+ }
830
+ }
831
+ case "configure_kaggle": {
832
+ const username = String(request.params.arguments?.username || "").trim();
833
+ const key = String(request.params.arguments?.key || "").trim();
834
+ if (!username || !key) {
835
+ throw new McpError(ErrorCode.InvalidParams, "username and key are required");
836
+ }
837
+ const r1 = secureKeys.set("kaggle_username", username);
838
+ const r2 = secureKeys.set("kaggle_key", key);
839
+ process.env.KAGGLE_USERNAME = username;
840
+ process.env.KAGGLE_KEY = key;
841
+ return {
842
+ content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
843
+ };
844
+ }
845
+ case "configure_keys": {
846
+ const hfToken = String(request.params.arguments?.hf_token || "").trim();
847
+ const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
848
+ const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
849
+ const saved = [];
850
+ const methods = [];
851
+ if (hfToken) {
852
+ const r = secureKeys.set("hf_token", hfToken);
853
+ if (r.ok) {
854
+ process.env.HF_TOKEN = hfToken;
855
+ saved.push("HF token");
856
+ if (r.method)
857
+ methods.push(r.method);
858
+ }
859
+ }
860
+ if (kaggleUsername) {
861
+ const r = secureKeys.set("kaggle_username", kaggleUsername);
862
+ if (r.ok) {
863
+ process.env.KAGGLE_USERNAME = kaggleUsername;
864
+ saved.push("Kaggle username");
865
+ if (r.method)
866
+ methods.push(r.method);
867
+ }
868
+ }
869
+ if (kaggleKey) {
870
+ const r = secureKeys.set("kaggle_key", kaggleKey);
871
+ if (r.ok) {
872
+ process.env.KAGGLE_KEY = kaggleKey;
873
+ saved.push("Kaggle key");
874
+ if (r.method)
875
+ methods.push(r.method);
876
+ }
877
+ }
878
+ if (saved.length === 0) {
879
+ return {
880
+ content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
881
+ };
882
+ }
883
+ return {
884
+ content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
885
+ };
886
+ }
440
887
  case "get_dataset_info": {
441
888
  const datasetId = String(request.params.arguments?.dataset_id);
442
889
  if (!datasetId) {
@@ -518,14 +965,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
518
965
  confidence: targetResult.confidence
519
966
  } : undefined;
520
967
  const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
521
- let explanation = `### 📋 Cleaning Plan for ${datasetId}\n\n`;
968
+ let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
522
969
  if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
523
- explanation += `🎯 **Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
970
+ explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
524
971
  explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
525
972
  }
526
973
  explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
527
974
  if (plan.operations.length === 0) {
528
- explanation += "No cleaning operations required.";
975
+ explanation += "No cleaning operations required.";
529
976
  }
530
977
  else {
531
978
  plan.operations.forEach((op, i) => {
@@ -546,7 +993,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
546
993
  }
547
994
  case "prepare_dataset": {
548
995
  const query = String(request.params.arguments?.query);
549
- const job = jobManager.createJob("prepare", 0, { query });
996
+ const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
997
+ const job = jobManager.createJob("prepare", 0, { query, requirements });
550
998
  return {
551
999
  content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
552
1000
  };
@@ -577,7 +1025,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
577
1025
  case "export_dataset": {
578
1026
  const datasetId = String(request.params.arguments?.dataset_id);
579
1027
  const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
580
- const requestedFormat = request.params.arguments?.format || "csv";
1028
+ const requestedFormat = String(request.params.arguments?.format || "feather");
1029
+ const fastMode = request.params.arguments?.fast === true;
1030
+ const preview = request.params.arguments?.preview === true;
1031
+ const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
1032
+ const columns = request.params.arguments?.columns;
1033
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
581
1034
  const dataset = metadataStore.getDataset(datasetId);
582
1035
  if (!dataset) {
583
1036
  throw new McpError(ErrorCode.InvalidParams, `Dataset not found: ${datasetId}`);
@@ -591,30 +1044,153 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
591
1044
  };
592
1045
  }
593
1046
  let sourcePath = downloadStatus.local_path;
594
- // Check if we need conversion
595
- const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
596
- if (currentExt !== requestedFormat) {
597
- console.error(`[Export] Format mismatch (${currentExt} vs ${requestedFormat}). Converting...`);
598
- try {
599
- const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, requestedFormat);
600
- sourcePath = pipelineResult.final_output_path;
601
- }
602
- catch (err) {
603
- return {
604
- content: [{ type: "text", text: `ERROR: Failed to convert dataset to ${requestedFormat}: ${err.message}` }],
605
- isError: true
606
- };
1047
+ // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
1048
+ if (!fastMode) {
1049
+ const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
1050
+ const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "csv";
1051
+ if (currentExt !== pipelineFmt) {
1052
+ console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
1053
+ try {
1054
+ const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
1055
+ if (pipelineResult.final_output_path) {
1056
+ sourcePath = pipelineResult.final_output_path;
1057
+ }
1058
+ }
1059
+ catch (err) {
1060
+ console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
1061
+ }
607
1062
  }
608
1063
  }
1064
+ else {
1065
+ console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
1066
+ }
1067
+ // Build export options
1068
+ const exportOpts = {};
1069
+ if (compression)
1070
+ exportOpts.compression = compression;
1071
+ if (preview)
1072
+ exportOpts.preview = true;
1073
+ if (sampleRows)
1074
+ exportOpts.sample_rows = sampleRows;
1075
+ if (columns)
1076
+ exportOpts.columns = columns;
609
1077
  try {
610
- const finalPath = await installService.install(datasetId, sourcePath, targetDir);
1078
+ // Determine output file name
1079
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
1080
+ const ext = extMap[requestedFormat] || ".feather";
1081
+ const safeName = datasetId.replace(/\//g, "_");
1082
+ const outDir = targetDir || path.join(dataRoot, "exports");
1083
+ if (!fs.existsSync(outDir))
1084
+ fs.mkdirSync(outDir, { recursive: true });
1085
+ const outputFile = path.join(outDir, `${safeName}${ext}`);
1086
+ const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
1087
+ // Build rich response
1088
+ let msg = `**Export complete**\n`;
1089
+ msg += `- **File**: ${result.output_path}\n`;
1090
+ msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
1091
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
1092
+ if (result.file_size_mb !== undefined)
1093
+ msg += `- **Size**: ${result.file_size_mb} MB\n`;
1094
+ if (result.elapsed_seconds !== undefined)
1095
+ msg += `- **Time**: ${result.elapsed_seconds}s\n`;
1096
+ if (result.preview_path)
1097
+ msg += `- **Preview**: ${result.preview_path}\n`;
1098
+ msg += `\n`;
1099
+ if (requestedFormat === "feather") {
1100
+ msg += `**Inspect with:**\n`;
1101
+ msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
1102
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1103
+ }
1104
+ else if (requestedFormat === "parquet") {
1105
+ msg += `**Inspect with:**\n`;
1106
+ msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
1107
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1108
+ }
1109
+ return { content: [{ type: "text", text: msg }] };
1110
+ }
1111
+ catch (error) {
1112
+ return {
1113
+ content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
1114
+ isError: true
1115
+ };
1116
+ }
1117
+ }
1118
+ case "fuse_datasets": {
1119
+ const rawSources = request.params.arguments?.sources;
1120
+ if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
1121
+ throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
1122
+ }
1123
+ const strategy = request.params.arguments?.strategy || "concat";
1124
+ const joinOn = request.params.arguments?.join_on;
1125
+ const how = request.params.arguments?.how || "inner";
1126
+ const dedup = request.params.arguments?.dedup !== false;
1127
+ const runQualityAfter = request.params.arguments?.run_quality_after !== false;
1128
+ const leakageCheck = request.params.arguments?.leakage_check !== false;
1129
+ const outputFormat = request.params.arguments?.output_format || "feather";
1130
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
1131
+ const preview = request.params.arguments?.preview !== false;
1132
+ const resolvedPaths = [];
1133
+ const unresolved = [];
1134
+ for (const src of rawSources) {
1135
+ if (fs.existsSync(src)) {
1136
+ resolvedPaths.push(src);
1137
+ continue;
1138
+ }
1139
+ const status = metadataStore.getDownloadStatus(src);
1140
+ if (status?.local_path && fs.existsSync(status.local_path)) {
1141
+ resolvedPaths.push(status.local_path);
1142
+ continue;
1143
+ }
1144
+ unresolved.push(src);
1145
+ }
1146
+ if (unresolved.length > 0) {
611
1147
  return {
612
- content: [{ type: "text", text: `✅ Dataset ${datasetId} exported to: ${finalPath}` }]
1148
+ content: [{
1149
+ type: "text",
1150
+ text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
1151
+ }],
1152
+ isError: true
613
1153
  };
614
1154
  }
1155
+ try {
1156
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
1157
+ const ext = extMap[outputFormat] || ".feather";
1158
+ const outDir = path.join(dataRoot, "fusion");
1159
+ if (!fs.existsSync(outDir))
1160
+ fs.mkdirSync(outDir, { recursive: true });
1161
+ const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
1162
+ const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
1163
+ strategy,
1164
+ join_on: joinOn,
1165
+ how,
1166
+ dedup,
1167
+ run_quality_after: runQualityAfter,
1168
+ leakage_check: leakageCheck,
1169
+ output_format: outputFormat,
1170
+ compression: compression,
1171
+ preview,
1172
+ });
1173
+ const nullDelta = result.stats.null_delta;
1174
+ const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
1175
+ let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
1176
+ msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
1177
+ msg += `- Null change: ${nullText}\n`;
1178
+ msg += `- Output: ${result.output_path}\n`;
1179
+ if (result.preview_path)
1180
+ msg += `- Preview: ${result.preview_path}\n`;
1181
+ if (result.leakage_report) {
1182
+ msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
1183
+ if (result.leakage_report.leakage_count) {
1184
+ msg += ` (${result.leakage_report.leakage_count})`;
1185
+ }
1186
+ msg += "\n";
1187
+ }
1188
+ msg += `\nNext: run split_dataset/export_dataset on fused output.`;
1189
+ return { content: [{ type: "text", text: msg }] };
1190
+ }
615
1191
  catch (error) {
616
1192
  return {
617
- content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
1193
+ content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
618
1194
  isError: true
619
1195
  };
620
1196
  }
@@ -626,16 +1202,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
626
1202
  }
627
1203
  try {
628
1204
  const report = await imageAnalyzer.analyze(inputPath);
629
- let output = `## 📷 Image Quality Report\n\n`;
1205
+ let output = `## Image Quality Report\n\n`;
630
1206
  output += `- **Total Images**: ${report.total_images}\n`;
631
1207
  output += `- **Corrupted**: ${report.corrupted_count}\n`;
632
1208
  output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
633
1209
  output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
634
1210
  if (report.individual_results.length > 0) {
635
- output += `### 🔬 Sample Detail (Top 5)\n`;
1211
+ output += `### Sample Detail (Top 5)\n`;
636
1212
  report.individual_results.slice(0, 5).forEach(img => {
637
- const statusEmoji = img.status === "ok" ? (img.is_blurry ? "⚠️" : "") : "";
638
- output += `${statusEmoji} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
1213
+ const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
1214
+ output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
639
1215
  });
640
1216
  }
641
1217
  return {
@@ -656,7 +1232,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
656
1232
  }
657
1233
  try {
658
1234
  const report = await mediaAnalyzer.analyze(inputPath);
659
- let output = `## 🎬 Media Quality Report\n\n`;
1235
+ let output = `## Media Quality Report\n\n`;
660
1236
  output += `- **Total Files**: ${report.total_files}\n`;
661
1237
  output += `- **OK Files**: ${report.ok_files}\n`;
662
1238
  output += `- **Failed Files**: ${report.failed_files}\n`;
@@ -667,17 +1243,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
667
1243
  output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
668
1244
  output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
669
1245
  }
670
- output += `\n### 📊 Sample Detail (Top 5)\n`;
1246
+ output += `\n### Sample Detail (Top 5)\n`;
671
1247
  report.details.slice(0, 5).forEach(item => {
672
- const statusEmoji = item.status === "ok" ? "" : "";
1248
+ const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
673
1249
  if (item.type === "audio" && 'sample_rate' in item) {
674
- output += `${statusEmoji} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
1250
+ output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
675
1251
  }
676
1252
  else if (item.type === "video" && 'width' in item) {
677
- output += `${statusEmoji} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
1253
+ output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
678
1254
  }
679
1255
  else {
680
- output += `${statusEmoji} **${item.filename}**: ${item.error}\n`;
1256
+ output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
681
1257
  }
682
1258
  });
683
1259
  return {
@@ -708,39 +1284,39 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
708
1284
  metadata.unified_quality_report = report;
709
1285
  await metadataStore.saveDataset(metadata);
710
1286
  }
711
- let output = `# 📊 Unified Quality Report\n\n`;
1287
+ let output = `# Unified Quality Report\n\n`;
712
1288
  output += `**Dataset**: ${datasetId}\n`;
713
1289
  output += `**Modalities**: ${report.modalities.join(", ")}\n`;
714
1290
  output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
715
1291
  if (report.text_quality) {
716
- output += `## 📝 Text Quality\n`;
1292
+ output += `## Text Quality\n`;
717
1293
  output += `- Rows: ${report.text_quality.row_count}\n`;
718
1294
  output += `- Columns: ${report.text_quality.column_count}\n`;
719
1295
  output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
720
1296
  output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
721
1297
  }
722
1298
  if (report.image_quality) {
723
- output += `## 🖼️ Image Quality\n`;
1299
+ output += `## Image Quality\n`;
724
1300
  output += `- Total Images: ${report.image_quality.total_images}\n`;
725
1301
  output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
726
1302
  output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
727
1303
  output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
728
1304
  }
729
1305
  if (report.audio_quality) {
730
- output += `## 🎵 Audio Quality\n`;
1306
+ output += `## Audio Quality\n`;
731
1307
  output += `- Total Files: ${report.audio_quality.total_files}\n`;
732
1308
  output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
733
1309
  output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
734
1310
  output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
735
1311
  }
736
1312
  if (report.video_quality) {
737
- output += `## 🎬 Video Quality\n`;
1313
+ output += `## Video Quality\n`;
738
1314
  output += `- Total Files: ${report.video_quality.total_files}\n`;
739
1315
  output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
740
1316
  output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
741
1317
  output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
742
1318
  }
743
- output += `## 💡 Recommendations\n`;
1319
+ output += `## Recommendations\n`;
744
1320
  report.recommendations.forEach(rec => {
745
1321
  output += `- ${rec}\n`;
746
1322
  });
@@ -761,8 +1337,32 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
761
1337
  });
762
1338
  async function main() {
763
1339
  const args = process.argv.slice(2);
1340
+ hydrateExternalKeys();
1341
+ const isFuse = args.includes("fuse");
1342
+ const isDiscover = args.includes("discover");
1343
+ const isDownload = args.includes("download");
1344
+ const isConfig = args.includes("config");
764
1345
  const isSetup = args.includes("--setup") || args.includes("setup");
765
1346
  const isSilent = args.includes("--silent");
1347
+ if (process.stdin.isTTY && !isSilent) {
1348
+ printLaunchScreen();
1349
+ }
1350
+ if (isFuse) {
1351
+ await runFuseCli(args);
1352
+ return;
1353
+ }
1354
+ if (isConfig) {
1355
+ await runConfigCli(args);
1356
+ return;
1357
+ }
1358
+ if (isDiscover) {
1359
+ await runDiscoverCli(args);
1360
+ return;
1361
+ }
1362
+ if (isDownload) {
1363
+ await runDownloadCli(args);
1364
+ return;
1365
+ }
766
1366
  // If run in setup mode OR in a terminal without args (human call), show setup wizard
767
1367
  if (isSetup || (process.stdin.isTTY && args.length === 0)) {
768
1368
  await runSetupWizard(isSilent);
@@ -778,24 +1378,274 @@ async function main() {
778
1378
  console.error("Tip: To configure Vesper for your IDE, run: npx @vespermcp/mcp-server --setup");
779
1379
  console.log("[Vesper] Main loop finished");
780
1380
  }
1381
/**
 * CLI entry for `vespermcp config ...`.
 *
 * Supports two subcommands:
 *  - `config keys`   — interactive prompt for all optional keys (HF token,
 *                      Kaggle username/key); any field may be skipped.
 *  - `config kaggle` — backward-compatible Kaggle-only path, taking
 *                      `--username`/`--key` flags or prompting for them.
 *
 * Core Vesper tools work with zero API keys; everything here is optional.
 *
 * @param {string[]} args - raw CLI argv tail (includes the "config" token).
 */
async function runConfigCli(args) {
    const isKeys = args.includes("keys");
    const isKaggle = args.includes("kaggle");
    if (!(isKeys || isKaggle) || args.includes("--help")) {
        console.log("Usage: vespermcp config keys");
        console.log(" vespermcp config kaggle --username <name> --key <api_key>");
        console.log("Core Vesper tools work with zero API keys.");
        return;
    }
    // Returns the token immediately following `name`, or undefined when the
    // flag is absent or is the last token.
    const getArgValue = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        return undefined;
    };
    if (isKeys) {
        console.log("\nVesper Optional Keys Setup");
        console.log("(Press Enter to skip any field)\n");
        const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
        const ask = (q) => new Promise(resolve => rl.question(q, resolve));
        // Show "saved"/"empty" so the user knows which keys already exist
        // without ever echoing the secret values themselves.
        const current = secureKeys.getAll();
        const hfToken = (await ask(`Hugging Face token [${current.hf_token ? "saved" : "empty"}]: `)).trim();
        const kaggleUsername = (await ask(`Kaggle username [${current.kaggle_username ? "saved" : "empty"}]: `)).trim();
        const kaggleKey = (await ask(`Kaggle key [${current.kaggle_key ? "saved" : "empty"}]: `)).trim();
        rl.close();
        const saved = [];
        // Each key is persisted via secureKeys and mirrored into process.env
        // so the current process can use it immediately.
        if (hfToken) {
            const res = secureKeys.set("hf_token", hfToken);
            if (res.ok) {
                process.env.HF_TOKEN = hfToken;
                saved.push("HF token");
            }
        }
        if (kaggleUsername) {
            const res = secureKeys.set("kaggle_username", kaggleUsername);
            if (res.ok) {
                process.env.KAGGLE_USERNAME = kaggleUsername;
                saved.push("Kaggle username");
            }
        }
        if (kaggleKey) {
            const res = secureKeys.set("kaggle_key", kaggleKey);
            if (res.ok) {
                process.env.KAGGLE_KEY = kaggleKey;
                saved.push("Kaggle key");
            }
        }
        if (saved.length === 0) {
            console.log("No new keys saved (all skipped). Core tools continue to work without keys.");
            return;
        }
        console.log(`Key(s) saved securely: ${saved.join(", ")}`);
        console.log("You can now use Kaggle and gated Hugging Face datasets.");
        return;
    }
    // Backward-compatible Kaggle-specific path
    let username = getArgValue("--username") || "";
    let key = getArgValue("--key") || "";
    if (!username || !key) {
        const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
        const ask = (q) => new Promise(resolve => rl.question(q, resolve));
        if (!username)
            username = (await ask("Kaggle username: ")).trim();
        if (!key)
            key = (await ask("Kaggle key: ")).trim();
        rl.close();
    }
    if (!username || !key) {
        console.error("Missing Kaggle username/key. Aborting.");
        process.exit(1);
    }
    // FIX: the set() results were previously ignored on this path (unlike the
    // `keys` path above), so a failed secure save still reported success.
    const userRes = secureKeys.set("kaggle_username", username);
    const keyRes = secureKeys.set("kaggle_key", key);
    if (!userRes.ok || !keyRes.ok) {
        console.error("Failed to save Kaggle credentials securely. Aborting.");
        process.exit(1);
    }
    process.env.KAGGLE_USERNAME = username;
    process.env.KAGGLE_KEY = key;
    console.log("Key saved securely. You can now use Kaggle datasets.");
}
1458
/**
 * CLI entry for `vespermcp discover [--source huggingface|kaggle] <query...> [--limit N]`.
 *
 * Non-flag tokens (other than the "discover" token itself) are joined into
 * the search query. Kaggle requires credentials and offers an interactive
 * setup prompt when they are missing; Hugging Face needs no key.
 *
 * @param {string[]} args - raw CLI argv tail (includes the "discover" token).
 */
async function runDiscoverCli(args) {
    // Returns the token immediately following `name`, or undefined.
    const getArgValue = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        return undefined;
    };
    const source = (getArgValue("--source") || "huggingface").toLowerCase();
    // FIX: a non-numeric or non-positive --limit previously propagated NaN /
    // negative values into the scrapers; clamp to a sane default instead.
    const parsedLimit = Number(getArgValue("--limit") || "10");
    const limit = Number.isFinite(parsedLimit) && parsedLimit > 0 ? Math.floor(parsedLimit) : 10;
    // Everything that is not a flag (or a flag's value) is part of the query.
    const queryParts = [];
    for (let i = 0; i < args.length; i++) {
        const token = args[i];
        if (token === "discover")
            continue;
        if (token === "--source" || token === "--limit") {
            i += 1; // skip the flag's value too
            continue;
        }
        if (token.startsWith("--"))
            continue;
        queryParts.push(token);
    }
    const query = queryParts.join(" ").trim();
    if (!query) {
        console.error("Usage: vespermcp discover --source kaggle \"credit risk\" --limit 10");
        process.exit(1);
    }
    if (source === "kaggle") {
        if (!dataIngestor.hasKaggleCredentials()) {
            console.error("Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).");
            // Offer inline setup only when attached to a terminal.
            if (process.stdin.isTTY) {
                const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
                const answer = await new Promise(resolve => rl.question("Configure Kaggle now? [y/N]: ", resolve));
                rl.close();
                if (answer.trim().toLowerCase() === "y") {
                    await runConfigCli(["config", "kaggle"]);
                }
            }
            if (!dataIngestor.hasKaggleCredentials())
                process.exit(1);
        }
        try {
            const results = await kaggleSource.discover(query, limit);
            console.log(formatSearchResults(results));
        }
        catch (error) {
            const msg = String(error?.message || error);
            if (msg.toLowerCase().includes("kaggle package not installed")) {
                console.error("Kaggle support is optional and needs the official client: pip install kaggle");
            }
            else {
                console.error(`Kaggle discover failed: ${msg}`);
            }
            process.exit(1);
        }
        return;
    }
    // FIX: the Hugging Face path previously had no error handling, so a
    // scrape failure crashed the CLI with an unhandled-rejection stack trace.
    try {
        const hf = new HuggingFaceScraper();
        const results = await hf.scrape(limit, true, query);
        console.log(formatSearchResults(results));
    }
    catch (error) {
        console.error(`Hugging Face discover failed: ${String(error?.message || error)}`);
        process.exit(1);
    }
}
1519
/**
 * CLI entry for `vespermcp download <source> <dataset-id> [--target-dir <dir>]`.
 *
 * Kaggle downloads with an explicit --target-dir go straight through the
 * Kaggle source (and are registered in the metadata store); everything else
 * goes through the generic ingestor. Missing Kaggle credentials trigger an
 * interactive setup offer when a TTY is attached.
 *
 * @param {string[]} args - raw CLI argv tail (includes the "download" token).
 */
async function runDownloadCli(args) {
    // Usage: vespermcp download kaggle user/dataset-name [--target-dir C:/path]
    const dirFlagAt = args.indexOf("--target-dir");
    const targetDir = dirFlagAt >= 0 && dirFlagAt + 1 < args.length ? args[dirFlagAt + 1] : undefined;
    // Positional tokens: everything that is neither a flag nor the
    // --target-dir value. positional[0] is the "download" token itself.
    const positional = [];
    args.forEach((token, i) => {
        const isFlag = token.startsWith("--");
        const isDirValue = dirFlagAt >= 0 && i === dirFlagAt + 1;
        if (!isFlag && !isDirValue)
            positional.push(token);
    });
    const source = (positional[1] || "").toLowerCase();
    const datasetId = positional[2] || "";
    if (!source || !datasetId) {
        console.error("Usage: vespermcp download kaggle <username/dataset-name> [--target-dir C:/path]");
        process.exit(1);
    }
    if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
        console.error("Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).");
        if (process.stdin.isTTY) {
            const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
            const answer = await new Promise(resolve => rl.question("Configure Kaggle now? [y/N]: ", resolve));
            rl.close();
            if (answer.trim().toLowerCase() === "y") {
                await runConfigCli(["config", "kaggle"]);
            }
        }
        if (!dataIngestor.hasKaggleCredentials())
            process.exit(1);
    }
    let localPath = "";
    try {
        const directKaggle = source === "kaggle" && Boolean(targetDir);
        if (directKaggle) {
            // Accept either a bare "user/name" id or a full kaggle.com URL.
            const marker = "kaggle.com/datasets/";
            const normalized = datasetId.includes(marker)
                ? datasetId.split(marker)[1].replace(/^\//, "")
                : datasetId;
            const dl = await kaggleSource.download(normalized, targetDir);
            localPath = dl.local_path;
            const size = fs.existsSync(localPath) ? fs.statSync(localPath).size : 0;
            metadataStore.registerDownload(normalized, localPath, "completed", size);
        }
        else {
            localPath = await dataIngestor.ensureData(datasetId, source, (msg) => console.log(msg));
        }
    }
    catch (error) {
        const msg = String(error?.message || error);
        if (source === "kaggle" && msg.toLowerCase().includes("kaggle package not installed")) {
            console.error("Kaggle support is optional and needs the official client: pip install kaggle");
        }
        else {
            console.error(`Download failed: ${msg}`);
        }
        process.exit(1);
    }
    console.log(`Download complete: ${localPath}`);
}
1576
/**
 * CLI entry for `vespermcp fuse --sources <f1> <f2> [...] [options]`.
 *
 * Fuses two or more dataset files via the fusion engine. Options:
 *  --strategy concat|join, --on <cols>, --how inner|left|outer,
 *  --format feather|parquet|csv|jsonl|arrow, --compression, --output,
 *  --dedup, --quality, --leakage, --no-preview.
 *
 * @param {string[]} args - raw CLI argv tail (includes the "fuse" token).
 */
async function runFuseCli(args) {
    // Returns the token immediately following `name`, or undefined.
    const getArgValue = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        return undefined;
    };
    // Collects the run of consecutive non-flag tokens after `name`
    // (e.g. the file list following --sources).
    const collectListAfter = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx < 0)
            return [];
        const out = [];
        for (let i = idx + 1; i < args.length; i++) {
            if (args[i].startsWith("--"))
                break;
            out.push(args[i]);
        }
        return out;
    };
    const sources = collectListAfter("--sources");
    if (sources.length < 2) {
        console.error("Usage: vespermcp fuse --sources <file1> <file2> [more] --strategy concat|join [--on id] [--how inner|left|outer] [--dedup] [--quality] [--leakage] [--format feather|parquet|csv|jsonl|arrow]");
        process.exit(1);
    }
    const strategy = getArgValue("--strategy") || "concat";
    const onValue = getArgValue("--on");
    // Comma-separated join columns, trimmed and empties dropped.
    const joinOn = onValue ? onValue.split(",").map(s => s.trim()).filter(Boolean) : undefined;
    const how = getArgValue("--how") || "inner";
    const outputFormat = getArgValue("--format") || "feather";
    const compression = getArgValue("--compression");
    // The extension is always the format name (the old "arrow" special case
    // mapped it to itself and was a no-op).
    const outputPath = getArgValue("--output") || path.join(process.cwd(), `fused_${Date.now()}.${outputFormat}`);
    const dedup = args.includes("--dedup");
    const runQualityAfter = args.includes("--quality");
    const leakageCheck = args.includes("--leakage");
    const preview = !args.includes("--no-preview");
    try {
        const result = await fusionEngine.fuse(sources, outputPath, {
            strategy,
            join_on: joinOn,
            how,
            dedup,
            run_quality_after: runQualityAfter,
            leakage_check: leakageCheck,
            output_format: outputFormat,
            compression,
            preview,
        });
        const nullDelta = result.stats.null_delta;
        const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
        console.log(`Fused ${result.stats.sources_count} sources → ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).`);
        // FIX: the label previously read "Null increase" even when the delta
        // was negative; match the MCP tool output, which reports a "change".
        console.log(`Null change: ${nullText}`);
        console.log(`Output: ${result.output_path}`);
        if (result.preview_path)
            console.log(`Preview saved: ${result.preview_path}`);
        console.log("Next: run vespermcp split/export on the fused dataset");
    }
    catch (error) {
        // FIX: a fuse failure previously surfaced as an unhandled-rejection
        // stack trace; fail cleanly like the other CLI subcommands.
        console.error(`Fusion failed: ${String(error?.message || error)}`);
        process.exit(1);
    }
}
781
1631
/**
 * Installs Vesper's MCP configuration into every detected coding agent,
 * showing a spinner during the install.
 *
 * @param {boolean} [silent=false] - when true, suppress all console output.
 */
async function runSetupWizard(silent = false) {
    // Single output gate so the silent flag is honoured everywhere.
    const say = (line) => {
        if (!silent)
            console.log(line);
    };
    const configManager = new ConfigManager();
    say(`\nVesper MCP - Universal Setup`);
    say(`================================`);
    say(`Installing to all detected coding agents...\n`);
    const result = await runWithSpinner("Installing to detected coding agents", () => configManager.installToAll());
    const nothingDetected = result.success.length === 0 && result.failed.length === 0;
    if (nothingDetected) {
        say("\nNo supported agents detected.");
        say("Supported agents: Claude Code, Claude Desktop, Cursor, VS Code, Codex, Antigravity");
        say("\nMake sure at least one is installed, then try again.");
        return;
    }
    say("Setup complete! Please RESTART your IDE(s) to apply changes.");
}
801
1651
  main().catch((error) => {