@vespermcp/mcp-server 1.1.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -5,10 +5,13 @@ import { CallToolRequestSchema, ListToolsRequestSchema, ErrorCode, McpError, } f
5
5
  import { fileURLToPath } from "url";
6
6
  import path from "path";
7
7
  import fs from "fs";
8
+ import { spawn } from "child_process";
8
9
  import { MetadataStore } from "./metadata/store.js";
9
10
  import { VectorStore } from "./search/vector-store.js";
10
11
  import { Embedder } from "./search/embedder.js";
11
12
  import { SearchEngine } from "./search/engine.js";
13
+ import { HuggingFaceScraper } from "./metadata/scraper.js";
14
+ import { KaggleSource } from "./metadata/kaggle-source.js";
12
15
  import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
13
16
  import { JobManager } from "./jobs/manager.js";
14
17
  import { QualityAnalyzer } from "./quality/analyzer.js";
@@ -17,6 +20,7 @@ import { DataCleaner } from "./cleaning/cleaner.js";
17
20
  import { PipelineExecutor } from "./cleaning/executor.js";
18
21
  import { DataSplitter } from "./splitting/splitter.js";
19
22
  import { DataExporter } from "./export/exporter.js";
23
+ import { DataFusionEngine } from "./fusion/engine.js";
20
24
  import { DataIngestor } from "./ingestion/ingestor.js";
21
25
  import { InstallService } from "./install/install-service.js";
22
26
  import { CacheService, MockRedisProvider } from "./cache/service.js";
@@ -24,6 +28,8 @@ import { ImageAnalyzer } from "./quality/image-analyzer.js";
24
28
  import { MediaAnalyzer } from "./quality/media-analyzer.js";
25
29
  import { QualityOrchestrator } from "./quality/quality-orchestrator.js";
26
30
  import { ConfigManager } from "./config/config-manager.js";
31
+ import { SecureKeysManager } from "./config/secure-keys.js";
32
+ import readline from "readline";
27
33
  import os from "os";
28
34
  // Determine absolute paths relative to the compiled script
29
35
  const __filename = fileURLToPath(import.meta.url);
@@ -49,6 +55,50 @@ function logError(err, context) {
49
55
  fs.appendFileSync(errorLogPath, msg);
50
56
  console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
51
57
  }
58
+ function extractRequestedRows(query, requirements) {
59
+ const text = `${query || ""} ${requirements || ""}`.toLowerCase();
60
+ const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
61
+ if (explicit) {
62
+ const n = Number(explicit[1].replace(/[\s,]/g, ""));
63
+ if (Number.isFinite(n) && n > 0)
64
+ return n;
65
+ }
66
+ const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
67
+ .map(m => Number(m[0]))
68
+ .filter(n => Number.isFinite(n) && n > 0);
69
+ if (allNums.length > 0)
70
+ return Math.max(...allNums);
71
+ return undefined;
72
+ }
73
/**
 * Run a Python helper script and parse its stdout as JSON.
 *
 * @param {string} scriptPath - absolute path to the Python helper script
 * @param {string[]} args     - extra CLI arguments passed to the script
 * @returns {Promise<any>} parsed JSON emitted by the helper on stdout
 * @throws rejects when the interpreter cannot be launched, exits non-zero,
 *         or prints output that is not valid JSON
 */
function runPythonJson(scriptPath, args) {
  // Windows ships the `py` launcher; other platforms expose `python`.
  const pyCmd = process.platform === "win32" ? "py" : "python";
  return new Promise((resolve, reject) => {
    const proc = spawn(pyCmd, [scriptPath, ...args]);
    let stdout = "";
    let stderr = "";
    proc.stdout.on("data", (d) => (stdout += d.toString()));
    proc.stderr.on("data", (d) => (stderr += d.toString()));
    // BUG FIX: without an 'error' handler, a missing/unlaunchable Python
    // interpreter emitted an unhandled 'error' event and could leave this
    // Promise pending forever (the 'close' path is not guaranteed to settle it).
    proc.on("error", (err) => {
      reject(new Error(`Failed to launch ${pyCmd}: ${err.message}`));
    });
    proc.on("close", (code) => {
      if (code !== 0) {
        reject(new Error(stderr || stdout || `Python exited with ${code}`));
        return;
      }
      try {
        resolve(JSON.parse(stdout));
      }
      catch {
        reject(new Error(`Invalid JSON from python helper: ${stdout}`));
      }
    });
  });
}
95
/**
 * Count data rows in a local dataset file by delegating to the
 * row_count.py helper synced into the Vesper data directory.
 *
 * @param {string} filePath - path to the dataset file to inspect
 * @returns {Promise<number>} number of rows reported by the helper
 * @throws when the helper reports failure (result.ok is falsy)
 */
async function countRows(filePath) {
  const helperScript = path.join(dataRoot, "python", "row_count.py");
  const payload = await runPythonJson(helperScript, [filePath]);
  if (payload.ok) {
    return Number(payload.rows || 0);
  }
  throw new Error(payload.error || "Failed to count rows");
}
52
102
  /**
53
103
  * Sync Python scripts from the application package to the stable data directory (~/.vesper/python)
54
104
  */
@@ -105,6 +155,21 @@ const dataCleaner = new DataCleaner(__dirname);
105
155
  const pipelineExecutor = new PipelineExecutor(dataRoot, __dirname);
106
156
  const dataSplitter = new DataSplitter(__dirname);
107
157
  const dataExporter = new DataExporter(__dirname);
158
+ const fusionEngine = new DataFusionEngine(__dirname);
159
+ const kaggleSource = new KaggleSource(__dirname);
160
+ const secureKeys = new SecureKeysManager(__dirname);
161
/**
 * Copy optional stored credentials (HF token, Kaggle username/key) from the
 * secure key store into process.env so downstream tooling can read them.
 * Values already present in the environment are never overwritten —
 * user-exported env vars always win over stored keys.
 */
function hydrateExternalKeys() {
  const stored = secureKeys.getAll();
  // Either HF env var counts as "already configured".
  const hfConfigured = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
  if (stored.hf_token && !hfConfigured) {
    process.env.HF_TOKEN = String(stored.hf_token);
  }
  if (stored.kaggle_username && !process.env.KAGGLE_USERNAME) {
    process.env.KAGGLE_USERNAME = String(stored.kaggle_username);
  }
  if (stored.kaggle_key && !process.env.KAGGLE_KEY) {
    process.env.KAGGLE_KEY = String(stored.kaggle_key);
  }
}
108
173
  // CRITICAL FIX: Pass __dirname (build directory) to analyzers
109
174
  // Python scripts are in build/python/, so analyzers should look relative to build/
110
175
  // NOT relative to project root (appRoot)
@@ -136,7 +201,7 @@ jobManager.on("processJob", async (job, execute) => {
136
201
  console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
137
202
  const metadata = job.metadata ? JSON.parse(job.metadata) : {};
138
203
  switch (job.type) {
139
- case "prepare": return await handlePrepareJob(job.id, metadata.query);
204
+ case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
140
205
  case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
141
206
  default: throw new Error(`Unhandled job type: ${job.type}`);
142
207
  }
@@ -154,8 +219,9 @@ jobManager.on("processJob", async (job, execute) => {
154
219
  /**
155
220
  * Logic for preparing a dataset (Search + Ingest + Process)
156
221
  */
157
- async function handlePrepareJob(jobId, query) {
222
+ async function handlePrepareJob(jobId, query, requirements) {
158
223
  const update = (updates) => jobManager.updateJob(jobId, updates);
224
+ const requestedRows = extractRequestedRows(query, requirements);
159
225
  update({ progress: 10, status_text: "Searching for best dataset matching query..." });
160
226
  const results = await searchEngine.search(query, { limit: 1 });
161
227
  if (results.length === 0) {
@@ -176,9 +242,59 @@ async function handlePrepareJob(jobId, query) {
176
242
  }
177
243
  update({ progress: 30, status_text: `Starting download from ${source}...` });
178
244
  // ensureData handles download and returns path to the raw file
179
- const rawFilePath = await dataIngestor.ensureData(topDataset.id, source, (msg, prog) => {
245
+ let rawFilePath = await dataIngestor.ensureData(topDataset.id, source, (msg, prog) => {
180
246
  update({ status_text: msg, progress: 30 + (prog ? Math.floor(prog * 0.4) : 0) });
181
247
  });
248
+ if (requestedRows && requestedRows > 0) {
249
+ update({ progress: 62, status_text: `Validating requested sample count (${requestedRows.toLocaleString()})...` });
250
+ let currentRows = await countRows(rawFilePath);
251
+ if (currentRows < requestedRows) {
252
+ update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
253
+ const additional = await searchEngine.search(query, { limit: 8 });
254
+ const sourceFiles = [rawFilePath];
255
+ let totalRows = currentRows;
256
+ for (const ds of additional) {
257
+ if (ds.id === topDataset.id)
258
+ continue;
259
+ try {
260
+ const dsSource = ds.source;
261
+ if (dsSource === "kaggle" && !dataIngestor.hasKaggleCredentials())
262
+ continue;
263
+ const p = await dataIngestor.ensureData(ds.id, dsSource, () => undefined);
264
+ const r = await countRows(p);
265
+ if (r <= 0)
266
+ continue;
267
+ sourceFiles.push(p);
268
+ totalRows += r;
269
+ if (totalRows >= requestedRows)
270
+ break;
271
+ }
272
+ catch {
273
+ // ignore candidate failures and continue trying
274
+ }
275
+ }
276
+ if (sourceFiles.length > 1) {
277
+ update({ progress: 67, status_text: `Fusing ${sourceFiles.length} datasets to meet row target...` });
278
+ const fusedPath = path.join(dataRoot, "fusion", `prepare_fused_${Date.now()}.feather`);
279
+ const fusionResult = await fusionEngine.fuse(sourceFiles, fusedPath, {
280
+ strategy: "concat",
281
+ dedup: true,
282
+ run_quality_after: false,
283
+ leakage_check: false,
284
+ output_format: "feather",
285
+ compression: "lz4",
286
+ preview: true,
287
+ });
288
+ rawFilePath = fusionResult.output_path;
289
+ currentRows = await countRows(rawFilePath);
290
+ }
291
+ if (currentRows < requestedRows) {
292
+ throw new Error(`Requested ${requestedRows.toLocaleString()} samples, but only ${currentRows.toLocaleString()} available across current matches. ` +
293
+ `Try broader query or enable additional sources.`);
294
+ }
295
+ update({ progress: 69, status_text: `✅ Sample target met: ${currentRows.toLocaleString()} rows` });
296
+ }
297
+ }
182
298
  update({ progress: 70, status_text: "Analyzing dataset quality..." });
183
299
  const report = await qualityAnalyzer.analyze(rawFilePath);
184
300
  // Update local metadata with quality info
@@ -234,10 +350,84 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
234
350
  type: "string",
235
351
  description: "The search query. Use -term to exclude keywords.",
236
352
  },
353
+ enable_jit: {
354
+ type: "boolean",
355
+ description: "Enable live JIT search when local library results are insufficient (default: false).",
356
+ },
357
+ },
358
+ required: ["query"],
359
+ },
360
+ },
361
+ {
362
+ name: "discover_datasets",
363
+ description: "Discover datasets from a specific source. Kaggle is optional and requires user-provided API key.",
364
+ inputSchema: {
365
+ type: "object",
366
+ properties: {
367
+ query: {
368
+ type: "string",
369
+ description: "Search query, e.g. 'credit risk'.",
370
+ },
371
+ source: {
372
+ type: "string",
373
+ enum: ["huggingface", "kaggle"],
374
+ description: "Data source to discover from.",
375
+ },
376
+ limit: {
377
+ type: "number",
378
+ description: "Max results to return (default: 10).",
379
+ },
237
380
  },
238
381
  required: ["query"],
239
382
  },
240
383
  },
384
+ {
385
+ name: "download_dataset",
386
+ description: "Download a dataset by source and ID/slug into local Vesper storage. Kaggle requires optional API key.",
387
+ inputSchema: {
388
+ type: "object",
389
+ properties: {
390
+ source: {
391
+ type: "string",
392
+ enum: ["huggingface", "kaggle"],
393
+ description: "Dataset source.",
394
+ },
395
+ dataset_id: {
396
+ type: "string",
397
+ description: "Dataset ID/slug (e.g. user/dataset for Kaggle or HF).",
398
+ },
399
+ target_dir: {
400
+ type: "string",
401
+ description: "Optional target directory for downloaded files.",
402
+ }
403
+ },
404
+ required: ["source", "dataset_id"],
405
+ },
406
+ },
407
+ {
408
+ name: "configure_kaggle",
409
+ description: "Optionally store Kaggle API credentials for Kaggle discover/download. Core Vesper works without this.",
410
+ inputSchema: {
411
+ type: "object",
412
+ properties: {
413
+ username: { type: "string", description: "Kaggle username" },
414
+ key: { type: "string", description: "Kaggle API key" }
415
+ },
416
+ required: ["username", "key"],
417
+ },
418
+ },
419
+ {
420
+ name: "configure_keys",
421
+ description: "One-time optional key setup for external sources (Kaggle + gated HF). Core tools do not require keys.",
422
+ inputSchema: {
423
+ type: "object",
424
+ properties: {
425
+ hf_token: { type: "string", description: "Optional Hugging Face token for gated/private datasets" },
426
+ kaggle_username: { type: "string", description: "Optional Kaggle username" },
427
+ kaggle_key: { type: "string", description: "Optional Kaggle API key" }
428
+ },
429
+ },
430
+ },
241
431
  {
242
432
  name: "get_dataset_info",
243
433
  description: "Get detailed metadata for a specific dataset by its ID. Returns comprehensive information including license, safety flags, and data characteristics.",
@@ -346,7 +536,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
346
536
  },
347
537
  {
348
538
  name: "export_dataset",
349
- description: "Export an ingested or prepared dataset to a specific local directory.",
539
+ description: "Export a dataset to a local directory. Use format='feather' (default) for 5-10× faster writes than CSV. Add fast=true to skip quality/cleaning steps.",
350
540
  inputSchema: {
351
541
  type: "object",
352
542
  properties: {
@@ -360,13 +550,93 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
360
550
  },
361
551
  format: {
362
552
  type: "string",
363
- enum: ["csv", "parquet"],
364
- description: "Desired output format (default: csv).",
553
+ enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
554
+ description: "Output format. feather (fastest), parquet (best compression), csv (human-readable). Default: feather.",
555
+ },
556
+ compression: {
557
+ type: "string",
558
+ enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"],
559
+ description: "Compression algorithm. Default: lz4 for feather, snappy for parquet, none for csv.",
560
+ },
561
+ fast: {
562
+ type: "boolean",
563
+ description: "Skip quality analysis and cleaning – raw export only. Much faster. Default: false.",
564
+ },
565
+ preview: {
566
+ type: "boolean",
567
+ description: "Generate a small 500-row CSV preview alongside binary exports. Default: false.",
568
+ },
569
+ sample_rows: {
570
+ type: "number",
571
+ description: "Export only this many random rows (faster for huge datasets).",
572
+ },
573
+ columns: {
574
+ type: "array",
575
+ items: { type: "string" },
576
+ description: "Export only these columns (faster for wide datasets).",
365
577
  },
366
578
  },
367
579
  required: ["dataset_id"],
368
580
  },
369
581
  },
582
+ {
583
+ name: "fuse_datasets",
584
+ description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
585
+ inputSchema: {
586
+ type: "object",
587
+ properties: {
588
+ sources: {
589
+ type: "array",
590
+ items: { type: "string" },
591
+ description: "List of dataset IDs and/or local file paths to fuse.",
592
+ },
593
+ strategy: {
594
+ type: "string",
595
+ enum: ["concat", "join"],
596
+ description: "Fusion strategy. concat appends rows; join merges on key(s).",
597
+ },
598
+ join_on: {
599
+ oneOf: [
600
+ { type: "string" },
601
+ { type: "array", items: { type: "string" } }
602
+ ],
603
+ description: "Join key(s). Required when strategy='join'.",
604
+ },
605
+ how: {
606
+ type: "string",
607
+ enum: ["inner", "left", "outer"],
608
+ description: "Join mode (only for strategy='join').",
609
+ },
610
+ dedup: {
611
+ type: "boolean",
612
+ description: "Drop exact duplicate rows after fusion.",
613
+ },
614
+ run_quality_after: {
615
+ type: "boolean",
616
+ description: "Run quality analysis on the fused output.",
617
+ },
618
+ leakage_check: {
619
+ type: "boolean",
620
+ description: "Run leakage/overlap checks across fused sources.",
621
+ },
622
+ output_format: {
623
+ type: "string",
624
+ enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
625
+ description: "Output format (default: feather).",
626
+ },
627
+ compression: {
628
+ type: "string",
629
+ enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"],
630
+ description: "Compression algorithm for binary outputs.",
631
+ },
632
+ preview: {
633
+ type: "boolean",
634
+ description: "Generate a small preview CSV of fused output.",
635
+ },
636
+ },
637
+ required: ["sources"],
638
+ },
639
+ },
370
640
  {
371
641
  name: "analyze_image_quality",
372
642
  description: "Analyze image quality (resolution, blur, corruption) for a folder or single image.",
@@ -423,10 +693,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
423
693
  const query = String(request.params.arguments?.query);
424
694
  const limit = 5;
425
695
  const safeOnly = true; // Enable safe filter by default
696
+ const enableJIT = request.params.arguments?.enable_jit === true;
426
697
  if (!query) {
427
698
  throw new McpError(ErrorCode.InvalidParams, "Query is required");
428
699
  }
429
- const results = await searchEngine.search(query, { limit, safeOnly });
700
+ const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
430
701
  const formattedOutput = formatSearchResults(results);
431
702
  return {
432
703
  content: [
@@ -437,6 +708,123 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
437
708
  ],
438
709
  };
439
710
  }
711
+ case "discover_datasets": {
712
+ hydrateExternalKeys();
713
+ const query = String(request.params.arguments?.query || "").trim();
714
+ const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
715
+ const limit = Number(request.params.arguments?.limit || 10);
716
+ if (!query) {
717
+ throw new McpError(ErrorCode.InvalidParams, "query is required");
718
+ }
719
+ try {
720
+ let results = [];
721
+ if (source === "kaggle") {
722
+ if (!dataIngestor.hasKaggleCredentials()) {
723
+ return {
724
+ content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
725
+ isError: true,
726
+ };
727
+ }
728
+ results = await kaggleSource.discover(query, limit);
729
+ }
730
+ else {
731
+ const hf = new HuggingFaceScraper();
732
+ results = await hf.scrape(Math.max(1, limit), true, query);
733
+ }
734
+ const formattedOutput = formatSearchResults(results.slice(0, limit));
735
+ return {
736
+ content: [{ type: "text", text: formattedOutput }]
737
+ };
738
+ }
739
+ catch (error) {
740
+ return {
741
+ content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
742
+ isError: true,
743
+ };
744
+ }
745
+ }
746
+ case "download_dataset": {
747
+ hydrateExternalKeys();
748
+ const source = String(request.params.arguments?.source || "").toLowerCase();
749
+ const datasetId = String(request.params.arguments?.dataset_id || "").trim();
750
+ if (!source || !datasetId) {
751
+ throw new McpError(ErrorCode.InvalidParams, "source and dataset_id are required");
752
+ }
753
+ if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
754
+ return {
755
+ content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
756
+ isError: true,
757
+ };
758
+ }
759
+ try {
760
+ const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
761
+ return {
762
+ content: [{ type: "text", text: `✅ Download complete: ${localPath}` }]
763
+ };
764
+ }
765
+ catch (error) {
766
+ return {
767
+ content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
768
+ isError: true,
769
+ };
770
+ }
771
+ }
772
+ case "configure_kaggle": {
773
+ const username = String(request.params.arguments?.username || "").trim();
774
+ const key = String(request.params.arguments?.key || "").trim();
775
+ if (!username || !key) {
776
+ throw new McpError(ErrorCode.InvalidParams, "username and key are required");
777
+ }
778
+ const r1 = secureKeys.set("kaggle_username", username);
779
+ const r2 = secureKeys.set("kaggle_key", key);
780
+ process.env.KAGGLE_USERNAME = username;
781
+ process.env.KAGGLE_KEY = key;
782
+ return {
783
+ content: [{ type: "text", text: `✅ Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
784
+ };
785
+ }
786
+ case "configure_keys": {
787
+ const hfToken = String(request.params.arguments?.hf_token || "").trim();
788
+ const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
789
+ const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
790
+ const saved = [];
791
+ const methods = [];
792
+ if (hfToken) {
793
+ const r = secureKeys.set("hf_token", hfToken);
794
+ if (r.ok) {
795
+ process.env.HF_TOKEN = hfToken;
796
+ saved.push("HF token");
797
+ if (r.method)
798
+ methods.push(r.method);
799
+ }
800
+ }
801
+ if (kaggleUsername) {
802
+ const r = secureKeys.set("kaggle_username", kaggleUsername);
803
+ if (r.ok) {
804
+ process.env.KAGGLE_USERNAME = kaggleUsername;
805
+ saved.push("Kaggle username");
806
+ if (r.method)
807
+ methods.push(r.method);
808
+ }
809
+ }
810
+ if (kaggleKey) {
811
+ const r = secureKeys.set("kaggle_key", kaggleKey);
812
+ if (r.ok) {
813
+ process.env.KAGGLE_KEY = kaggleKey;
814
+ saved.push("Kaggle key");
815
+ if (r.method)
816
+ methods.push(r.method);
817
+ }
818
+ }
819
+ if (saved.length === 0) {
820
+ return {
821
+ content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
822
+ };
823
+ }
824
+ return {
825
+ content: [{ type: "text", text: `✅ Key saved securely. Updated: ${saved.join(", ")}.` }]
826
+ };
827
+ }
440
828
  case "get_dataset_info": {
441
829
  const datasetId = String(request.params.arguments?.dataset_id);
442
830
  if (!datasetId) {
@@ -546,7 +934,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
546
934
  }
547
935
  case "prepare_dataset": {
548
936
  const query = String(request.params.arguments?.query);
549
- const job = jobManager.createJob("prepare", 0, { query });
937
+ const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
938
+ const job = jobManager.createJob("prepare", 0, { query, requirements });
550
939
  return {
551
940
  content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
552
941
  };
@@ -577,7 +966,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
577
966
  case "export_dataset": {
578
967
  const datasetId = String(request.params.arguments?.dataset_id);
579
968
  const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
580
- const requestedFormat = request.params.arguments?.format || "csv";
969
+ const requestedFormat = String(request.params.arguments?.format || "feather");
970
+ const fastMode = request.params.arguments?.fast === true;
971
+ const preview = request.params.arguments?.preview === true;
972
+ const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
973
+ const columns = request.params.arguments?.columns;
974
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
581
975
  const dataset = metadataStore.getDataset(datasetId);
582
976
  if (!dataset) {
583
977
  throw new McpError(ErrorCode.InvalidParams, `Dataset not found: ${datasetId}`);
@@ -591,30 +985,153 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
591
985
  };
592
986
  }
593
987
  let sourcePath = downloadStatus.local_path;
594
- // Check if we need conversion
595
- const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
596
- if (currentExt !== requestedFormat) {
597
- console.error(`[Export] Format mismatch (${currentExt} vs ${requestedFormat}). Converting...`);
598
- try {
599
- const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, requestedFormat);
600
- sourcePath = pipelineResult.final_output_path;
601
- }
602
- catch (err) {
603
- return {
604
- content: [{ type: "text", text: `ERROR: Failed to convert dataset to ${requestedFormat}: ${err.message}` }],
605
- isError: true
606
- };
988
+ // If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
989
+ if (!fastMode) {
990
+ const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
991
+ const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "csv";
992
+ if (currentExt !== pipelineFmt) {
993
+ console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
994
+ try {
995
+ const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
996
+ if (pipelineResult.final_output_path) {
997
+ sourcePath = pipelineResult.final_output_path;
998
+ }
999
+ }
1000
+ catch (err) {
1001
+ console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
1002
+ }
607
1003
  }
608
1004
  }
1005
+ else {
1006
+ console.error(`[Export] ⚡ Fast mode – skipping quality analysis and cleaning`);
1007
+ }
1008
+ // Build export options
1009
+ const exportOpts = {};
1010
+ if (compression)
1011
+ exportOpts.compression = compression;
1012
+ if (preview)
1013
+ exportOpts.preview = true;
1014
+ if (sampleRows)
1015
+ exportOpts.sample_rows = sampleRows;
1016
+ if (columns)
1017
+ exportOpts.columns = columns;
609
1018
  try {
610
- const finalPath = await installService.install(datasetId, sourcePath, targetDir);
1019
+ // Determine output file name
1020
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
1021
+ const ext = extMap[requestedFormat] || ".feather";
1022
+ const safeName = datasetId.replace(/\//g, "_");
1023
+ const outDir = targetDir || path.join(dataRoot, "exports");
1024
+ if (!fs.existsSync(outDir))
1025
+ fs.mkdirSync(outDir, { recursive: true });
1026
+ const outputFile = path.join(outDir, `${safeName}${ext}`);
1027
+ const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
1028
+ // Build rich response
1029
+ let msg = `✅ **Export complete**\n`;
1030
+ msg += `- **File**: ${result.output_path}\n`;
1031
+ msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
1032
+ msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
1033
+ if (result.file_size_mb !== undefined)
1034
+ msg += `- **Size**: ${result.file_size_mb} MB\n`;
1035
+ if (result.elapsed_seconds !== undefined)
1036
+ msg += `- **Time**: ${result.elapsed_seconds}s\n`;
1037
+ if (result.preview_path)
1038
+ msg += `- **Preview**: ${result.preview_path}\n`;
1039
+ msg += `\n`;
1040
+ if (requestedFormat === "feather") {
1041
+ msg += `💡 **Inspect with:**\n`;
1042
+ msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
1043
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1044
+ }
1045
+ else if (requestedFormat === "parquet") {
1046
+ msg += `💡 **Inspect with:**\n`;
1047
+ msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
1048
+ msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
1049
+ }
1050
+ return { content: [{ type: "text", text: msg }] };
1051
+ }
1052
+ catch (error) {
611
1053
  return {
612
- content: [{ type: "text", text: `✅ Dataset ${datasetId} exported to: ${finalPath}` }]
1054
+ content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
1055
+ isError: true
613
1056
  };
614
1057
  }
1058
+ }
1059
+ case "fuse_datasets": {
1060
+ const rawSources = request.params.arguments?.sources;
1061
+ if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
1062
+ throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
1063
+ }
1064
+ const strategy = request.params.arguments?.strategy || "concat";
1065
+ const joinOn = request.params.arguments?.join_on;
1066
+ const how = request.params.arguments?.how || "inner";
1067
+ const dedup = request.params.arguments?.dedup !== false;
1068
+ const runQualityAfter = request.params.arguments?.run_quality_after !== false;
1069
+ const leakageCheck = request.params.arguments?.leakage_check !== false;
1070
+ const outputFormat = request.params.arguments?.output_format || "feather";
1071
+ const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
1072
+ const preview = request.params.arguments?.preview !== false;
1073
+ const resolvedPaths = [];
1074
+ const unresolved = [];
1075
+ for (const src of rawSources) {
1076
+ if (fs.existsSync(src)) {
1077
+ resolvedPaths.push(src);
1078
+ continue;
1079
+ }
1080
+ const status = metadataStore.getDownloadStatus(src);
1081
+ if (status?.local_path && fs.existsSync(status.local_path)) {
1082
+ resolvedPaths.push(status.local_path);
1083
+ continue;
1084
+ }
1085
+ unresolved.push(src);
1086
+ }
1087
+ if (unresolved.length > 0) {
1088
+ return {
1089
+ content: [{
1090
+ type: "text",
1091
+ text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
1092
+ }],
1093
+ isError: true
1094
+ };
1095
+ }
1096
+ try {
1097
+ const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
1098
+ const ext = extMap[outputFormat] || ".feather";
1099
+ const outDir = path.join(dataRoot, "fusion");
1100
+ if (!fs.existsSync(outDir))
1101
+ fs.mkdirSync(outDir, { recursive: true });
1102
+ const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
1103
+ const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
1104
+ strategy,
1105
+ join_on: joinOn,
1106
+ how,
1107
+ dedup,
1108
+ run_quality_after: runQualityAfter,
1109
+ leakage_check: leakageCheck,
1110
+ output_format: outputFormat,
1111
+ compression: compression,
1112
+ preview,
1113
+ });
1114
+ const nullDelta = result.stats.null_delta;
1115
+ const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
1116
+ let msg = `✅ Fused ${result.stats.sources_count} sources → ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
1117
+ msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
1118
+ msg += `- Null change: ${nullText}\n`;
1119
+ msg += `- Output: ${result.output_path}\n`;
1120
+ if (result.preview_path)
1121
+ msg += `- Preview: ${result.preview_path}\n`;
1122
+ if (result.leakage_report) {
1123
+ msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
1124
+ if (result.leakage_report.leakage_count) {
1125
+ msg += ` (${result.leakage_report.leakage_count})`;
1126
+ }
1127
+ msg += "\n";
1128
+ }
1129
+ msg += `\nNext: run split_dataset/export_dataset on fused output.`;
1130
+ return { content: [{ type: "text", text: msg }] };
1131
+ }
615
1132
  catch (error) {
616
1133
  return {
617
- content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
1134
+ content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
618
1135
  isError: true
619
1136
  };
620
1137
  }
@@ -761,8 +1278,29 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
761
1278
  });
762
1279
  async function main() {
763
1280
  const args = process.argv.slice(2);
1281
+ hydrateExternalKeys();
1282
+ const isFuse = args.includes("fuse");
1283
+ const isDiscover = args.includes("discover");
1284
+ const isDownload = args.includes("download");
1285
+ const isConfig = args.includes("config");
764
1286
  const isSetup = args.includes("--setup") || args.includes("setup");
765
1287
  const isSilent = args.includes("--silent");
1288
+ if (isFuse) {
1289
+ await runFuseCli(args);
1290
+ return;
1291
+ }
1292
+ if (isConfig) {
1293
+ await runConfigCli(args);
1294
+ return;
1295
+ }
1296
+ if (isDiscover) {
1297
+ await runDiscoverCli(args);
1298
+ return;
1299
+ }
1300
+ if (isDownload) {
1301
+ await runDownloadCli(args);
1302
+ return;
1303
+ }
766
1304
  // If run in setup mode OR in a terminal without args (human call), show setup wizard
767
1305
  if (isSetup || (process.stdin.isTTY && args.length === 0)) {
768
1306
  await runSetupWizard(isSilent);
@@ -778,6 +1316,256 @@ async function main() {
778
1316
  console.error("Tip: To configure Vesper for your IDE, run: npx @vespermcp/mcp-server --setup");
779
1317
  console.log("[Vesper] Main loop finished");
780
1318
  }
1319
// CLI entry for `vespermcp config ...`.
//
// Two sub-modes:
//   - `config keys`:   interactive prompt for all optional keys (HF + Kaggle);
//                      every field may be skipped by pressing Enter.
//   - `config kaggle`: backward-compatible Kaggle-only path accepting
//                      --username/--key flags, prompting for missing values.
//
// Saved keys are persisted via `secureKeys` and mirrored into process.env so
// the current process can use them immediately. Core tools need no keys.
async function runConfigCli(args) {
    const isKeys = args.includes("keys");
    const isKaggle = args.includes("kaggle");
    if (!(isKeys || isKaggle) || args.includes("--help")) {
        console.log("Usage: vespermcp config keys");
        console.log(" vespermcp config kaggle --username <name> --key <api_key>");
        console.log("Core Vesper tools work with zero API keys.");
        return;
    }
    // Returns the token following `name`, or undefined when absent/valueless.
    const getArgValue = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        return undefined;
    };
    if (isKeys) {
        console.log("\n🔐 Vesper Optional Keys Setup");
        console.log("(Press Enter to skip any field)\n");
        const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
        const ask = (q) => new Promise(resolve => rl.question(q, resolve));
        const current = secureKeys.getAll();
        const hfToken = (await ask(`Hugging Face token [${current.hf_token ? "saved" : "empty"}]: `)).trim();
        const kaggleUsername = (await ask(`Kaggle username [${current.kaggle_username ? "saved" : "empty"}]: `)).trim();
        const kaggleKey = (await ask(`Kaggle key [${current.kaggle_key ? "saved" : "empty"}]: `)).trim();
        rl.close();
        const saved = [];
        // Persist one key; on success mirror it into the environment and
        // record its label for the summary line.
        const trySave = (storeKey, envName, label, value) => {
            if (!value)
                return;
            const res = secureKeys.set(storeKey, value);
            if (res.ok) {
                process.env[envName] = value;
                saved.push(label);
            }
        };
        trySave("hf_token", "HF_TOKEN", "HF token", hfToken);
        trySave("kaggle_username", "KAGGLE_USERNAME", "Kaggle username", kaggleUsername);
        trySave("kaggle_key", "KAGGLE_KEY", "Kaggle key", kaggleKey);
        if (saved.length === 0) {
            console.log("No new keys saved (all skipped). Core tools continue to work without keys.");
            return;
        }
        console.log(`✅ Key(s) saved securely: ${saved.join(", ")}`);
        console.log("You can now use Kaggle and gated Hugging Face datasets.");
        return;
    }
    // Backward-compatible Kaggle-specific path
    let username = getArgValue("--username") || "";
    let key = getArgValue("--key") || "";
    if (!username || !key) {
        const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
        const ask = (q) => new Promise(resolve => rl.question(q, resolve));
        if (!username)
            username = (await ask("Kaggle username: ")).trim();
        if (!key)
            key = (await ask("Kaggle key: ")).trim();
        rl.close();
    }
    if (!username || !key) {
        console.error("Missing Kaggle username/key. Aborting.");
        process.exit(1);
    }
    // BUG FIX: the save results used to be ignored and success reported
    // unconditionally; mirror the `keys` path above and fail loudly instead.
    const userRes = secureKeys.set("kaggle_username", username);
    const keyRes = secureKeys.set("kaggle_key", key);
    if (!userRes.ok || !keyRes.ok) {
        console.error("Failed to save Kaggle credentials securely. Aborting.");
        process.exit(1);
    }
    process.env.KAGGLE_USERNAME = username;
    process.env.KAGGLE_KEY = key;
    console.log("✅ Key saved securely. You can now use Kaggle datasets.");
}
1396
// CLI entry for `vespermcp discover [--source huggingface|kaggle] <query> [--limit N]`.
// Searches the chosen dataset source and prints formatted results.
async function runDiscoverCli(args) {
    const getArgValue = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        return undefined;
    };
    const source = (getArgValue("--source") || "huggingface").toLowerCase();
    // BUG FIX: a non-numeric --limit used to propagate NaN downstream;
    // fall back to the default of 10 instead.
    const parsedLimit = Number(getArgValue("--limit") || "10");
    const limit = Number.isFinite(parsedLimit) ? parsedLimit : 10;
    // The query is every bare token that is neither the subcommand nor a flag
    // (flag values of --source/--limit are skipped explicitly).
    const queryParts = [];
    for (let i = 0; i < args.length; i++) {
        const token = args[i];
        if (token === "discover")
            continue;
        if (token === "--source" || token === "--limit") {
            i += 1; // skip the flag's value as well
            continue;
        }
        if (token.startsWith("--"))
            continue;
        queryParts.push(token);
    }
    const query = queryParts.join(" ").trim();
    if (!query) {
        console.error("Usage: vespermcp discover --source kaggle \"credit risk\" --limit 10");
        process.exit(1);
    }
    if (source === "kaggle") {
        if (!dataIngestor.hasKaggleCredentials()) {
            console.error("Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).");
            // Offer interactive setup only when attached to a terminal.
            if (process.stdin.isTTY) {
                const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
                const answer = await new Promise(resolve => rl.question("Configure Kaggle now? [y/N]: ", resolve));
                rl.close();
                if (answer.trim().toLowerCase() === "y") {
                    await runConfigCli(["config", "kaggle"]);
                }
            }
            if (!dataIngestor.hasKaggleCredentials())
                process.exit(1);
        }
        try {
            const results = await kaggleSource.discover(query, limit);
            console.log(formatSearchResults(results));
        }
        catch (error) {
            const msg = String(error?.message || error);
            if (msg.toLowerCase().includes("kaggle package not installed")) {
                console.error("Kaggle support is optional and needs the official client: pip install kaggle");
            }
            else {
                console.error(`Kaggle discover failed: ${msg}`);
            }
            process.exit(1);
        }
        return;
    }
    // BUG FIX: the Hugging Face path had no error handling — a scrape failure
    // surfaced as an unhandled promise rejection instead of a clean CLI error.
    const hf = new HuggingFaceScraper();
    try {
        const results = await hf.scrape(limit, true, query);
        console.log(formatSearchResults(results));
    }
    catch (error) {
        console.error(`Discover failed: ${String(error?.message || error)}`);
        process.exit(1);
    }
}
1457
// CLI entry for `vespermcp download kaggle <username/dataset-name> [--target-dir C:/path]`.
// Downloads a dataset; Kaggle downloads with an explicit target dir are also
// registered in the metadata store.
async function runDownloadCli(args) {
    const targetFlagAt = args.indexOf("--target-dir");
    const hasTargetValue = targetFlagAt >= 0 && targetFlagAt + 1 < args.length;
    const targetDir = hasTargetValue ? args[targetFlagAt + 1] : undefined;
    // Positional tokens: everything that is neither a flag nor the --target-dir value.
    const positional = [];
    args.forEach((token, i) => {
        const isTargetValue = targetFlagAt >= 0 && i === targetFlagAt + 1;
        if (!token.startsWith("--") && !isTargetValue)
            positional.push(token);
    });
    const source = (positional[1] || "").toLowerCase();
    const datasetId = positional[2] || "";
    if (!source || !datasetId) {
        console.error("Usage: vespermcp download kaggle <username/dataset-name> [--target-dir C:/path]");
        process.exit(1);
    }
    if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
        console.error("Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).");
        // Interactive terminals get a one-shot offer to configure credentials.
        if (process.stdin.isTTY) {
            const prompt = readline.createInterface({ input: process.stdin, output: process.stdout });
            const reply = await new Promise(resolve => prompt.question("Configure Kaggle now? [y/N]: ", resolve));
            prompt.close();
            if (reply.trim().toLowerCase() === "y")
                await runConfigCli(["config", "kaggle"]);
        }
        if (!dataIngestor.hasKaggleCredentials())
            process.exit(1);
    }
    let localPath = "";
    try {
        if (source === "kaggle" && targetDir) {
            // Accept both bare `user/dataset` slugs and full kaggle.com URLs.
            const marker = "kaggle.com/datasets/";
            const normalized = datasetId.includes(marker)
                ? datasetId.split(marker)[1].replace(/^\//, "")
                : datasetId;
            const dl = await kaggleSource.download(normalized, targetDir);
            localPath = dl.local_path;
            const size = fs.existsSync(localPath) ? fs.statSync(localPath).size : 0;
            metadataStore.registerDownload(normalized, localPath, "completed", size);
        }
        else {
            // NOTE(review): this branch passes datasetId through un-normalized;
            // presumably ensureData accepts slugs — confirm for kaggle.com URLs.
            localPath = await dataIngestor.ensureData(datasetId, source, (msg) => console.log(msg));
        }
    }
    catch (error) {
        const msg = String(error?.message || error);
        const missingKaggleClient = source === "kaggle" && msg.toLowerCase().includes("kaggle package not installed");
        if (missingKaggleClient) {
            console.error("Kaggle support is optional and needs the official client: pip install kaggle");
        }
        else {
            console.error(`Download failed: ${msg}`);
        }
        process.exit(1);
    }
    console.log(`✅ Download complete: ${localPath}`);
}
1514
// CLI entry for `vespermcp fuse --sources <file1> <file2> [...] --strategy concat|join ...`.
// Runs the fusion engine over local files and prints a short summary
// (row counts, null delta, output/preview paths).
async function runFuseCli(args) {
    // Returns the token following `name`, or undefined when absent/valueless.
    const getArgValue = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        return undefined;
    };
    // Collects consecutive non-flag tokens after `name` (the file list for --sources).
    const collectListAfter = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx < 0)
            return [];
        const out = [];
        for (let i = idx + 1; i < args.length; i++) {
            if (args[i].startsWith("--"))
                break;
            out.push(args[i]);
        }
        return out;
    };
    const sources = collectListAfter("--sources");
    if (sources.length < 2) {
        console.error("Usage: vespermcp fuse --sources <file1> <file2> [more] --strategy concat|join [--on id] [--how inner|left|outer] [--dedup] [--quality] [--leakage] [--format feather|parquet|csv|jsonl|arrow]");
        process.exit(1);
    }
    const strategy = getArgValue("--strategy") || "concat";
    const onValue = getArgValue("--on");
    const joinOn = onValue ? onValue.split(",").map(s => s.trim()).filter(Boolean) : undefined;
    const how = getArgValue("--how") || "inner";
    const outputFormat = getArgValue("--format") || "feather";
    const compression = getArgValue("--compression");
    // The default file extension is simply the format name ("arrow" included),
    // so no special-casing is needed.
    const outputPath = getArgValue("--output") || path.join(process.cwd(), `fused_${Date.now()}.${outputFormat}`);
    const dedup = args.includes("--dedup");
    const runQualityAfter = args.includes("--quality");
    const leakageCheck = args.includes("--leakage");
    const preview = !args.includes("--no-preview");
    let result;
    try {
        result = await fusionEngine.fuse(sources, outputPath, {
            strategy,
            join_on: joinOn,
            how,
            dedup,
            run_quality_after: runQualityAfter,
            leakage_check: leakageCheck,
            output_format: outputFormat,
            compression,
            preview,
        });
    }
    catch (error) {
        // BUG FIX: fuse() was un-awaited-in-try — a failure surfaced as an
        // unhandled rejection; report a clean one-line error like the other CLIs.
        console.error(`Fusion failed: ${String(error?.message || error)}`);
        process.exit(1);
    }
    const nullDelta = result.stats.null_delta;
    const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
    console.log(`Fused ${result.stats.sources_count} sources → ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).`);
    console.log(`Null increase: ${nullText}`);
    console.log(`Output: ${result.output_path}`);
    if (result.preview_path)
        console.log(`Preview saved: ${result.preview_path}`);
    console.log("Next: run vespermcp split/export on the fused dataset");
}
781
1569
  async function runSetupWizard(silent = false) {
782
1570
  const configManager = new ConfigManager();
783
1571
  if (!silent) {