@vespermcp/mcp-server 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/fusion/engine.js +69 -0
- package/build/index.js +813 -25
- package/build/ingestion/hf-downloader.js +34 -5
- package/build/ingestion/ingestor.js +33 -9
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/scraper.js +34 -10
- package/build/python/config.py +259 -0
- package/build/python/export_engine.py +148 -52
- package/build/python/fusion_engine.py +368 -0
- package/build/python/kaggle_engine.py +204 -0
- package/build/python/row_count.py +54 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/scripts/build-index.js +5 -5
- package/build/search/jit-orchestrator.js +74 -14
- package/package.json +8 -2
- package/scripts/refresh-index.cjs +87 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/config.py +259 -0
- package/src/python/export_engine.py +148 -52
- package/src/python/fusion_engine.py +368 -0
- package/src/python/kaggle_engine.py +204 -0
- package/src/python/row_count.py +54 -0
- package/src/python/test_fusion_engine.py +89 -0
package/build/index.js
CHANGED
|
@@ -5,10 +5,13 @@ import { CallToolRequestSchema, ListToolsRequestSchema, ErrorCode, McpError, } f
|
|
|
5
5
|
import { fileURLToPath } from "url";
|
|
6
6
|
import path from "path";
|
|
7
7
|
import fs from "fs";
|
|
8
|
+
import { spawn } from "child_process";
|
|
8
9
|
import { MetadataStore } from "./metadata/store.js";
|
|
9
10
|
import { VectorStore } from "./search/vector-store.js";
|
|
10
11
|
import { Embedder } from "./search/embedder.js";
|
|
11
12
|
import { SearchEngine } from "./search/engine.js";
|
|
13
|
+
import { HuggingFaceScraper } from "./metadata/scraper.js";
|
|
14
|
+
import { KaggleSource } from "./metadata/kaggle-source.js";
|
|
12
15
|
import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
|
|
13
16
|
import { JobManager } from "./jobs/manager.js";
|
|
14
17
|
import { QualityAnalyzer } from "./quality/analyzer.js";
|
|
@@ -17,6 +20,7 @@ import { DataCleaner } from "./cleaning/cleaner.js";
|
|
|
17
20
|
import { PipelineExecutor } from "./cleaning/executor.js";
|
|
18
21
|
import { DataSplitter } from "./splitting/splitter.js";
|
|
19
22
|
import { DataExporter } from "./export/exporter.js";
|
|
23
|
+
import { DataFusionEngine } from "./fusion/engine.js";
|
|
20
24
|
import { DataIngestor } from "./ingestion/ingestor.js";
|
|
21
25
|
import { InstallService } from "./install/install-service.js";
|
|
22
26
|
import { CacheService, MockRedisProvider } from "./cache/service.js";
|
|
@@ -24,6 +28,8 @@ import { ImageAnalyzer } from "./quality/image-analyzer.js";
|
|
|
24
28
|
import { MediaAnalyzer } from "./quality/media-analyzer.js";
|
|
25
29
|
import { QualityOrchestrator } from "./quality/quality-orchestrator.js";
|
|
26
30
|
import { ConfigManager } from "./config/config-manager.js";
|
|
31
|
+
import { SecureKeysManager } from "./config/secure-keys.js";
|
|
32
|
+
import readline from "readline";
|
|
27
33
|
import os from "os";
|
|
28
34
|
// Determine absolute paths relative to the compiled script
|
|
29
35
|
const __filename = fileURLToPath(import.meta.url);
|
|
@@ -49,6 +55,50 @@ function logError(err, context) {
|
|
|
49
55
|
fs.appendFileSync(errorLogPath, msg);
|
|
50
56
|
console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
|
|
51
57
|
}
|
|
58
|
+
function extractRequestedRows(query, requirements) {
|
|
59
|
+
const text = `${query || ""} ${requirements || ""}`.toLowerCase();
|
|
60
|
+
const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
|
|
61
|
+
if (explicit) {
|
|
62
|
+
const n = Number(explicit[1].replace(/[\s,]/g, ""));
|
|
63
|
+
if (Number.isFinite(n) && n > 0)
|
|
64
|
+
return n;
|
|
65
|
+
}
|
|
66
|
+
const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
|
|
67
|
+
.map(m => Number(m[0]))
|
|
68
|
+
.filter(n => Number.isFinite(n) && n > 0);
|
|
69
|
+
if (allNums.length > 0)
|
|
70
|
+
return Math.max(...allNums);
|
|
71
|
+
return undefined;
|
|
72
|
+
}
|
|
73
|
+
function runPythonJson(scriptPath, args) {
|
|
74
|
+
const pyCmd = process.platform === "win32" ? "py" : "python";
|
|
75
|
+
return new Promise((resolve, reject) => {
|
|
76
|
+
const proc = spawn(pyCmd, [scriptPath, ...args]);
|
|
77
|
+
let stdout = "";
|
|
78
|
+
let stderr = "";
|
|
79
|
+
proc.stdout.on("data", (d) => (stdout += d.toString()));
|
|
80
|
+
proc.stderr.on("data", (d) => (stderr += d.toString()));
|
|
81
|
+
proc.on("close", (code) => {
|
|
82
|
+
if (code !== 0) {
|
|
83
|
+
reject(new Error(stderr || stdout || `Python exited with ${code}`));
|
|
84
|
+
return;
|
|
85
|
+
}
|
|
86
|
+
try {
|
|
87
|
+
resolve(JSON.parse(stdout));
|
|
88
|
+
}
|
|
89
|
+
catch {
|
|
90
|
+
reject(new Error(`Invalid JSON from python helper: ${stdout}`));
|
|
91
|
+
}
|
|
92
|
+
});
|
|
93
|
+
});
|
|
94
|
+
}
|
|
95
|
+
async function countRows(filePath) {
|
|
96
|
+
const scriptPath = path.join(dataRoot, "python", "row_count.py");
|
|
97
|
+
const result = await runPythonJson(scriptPath, [filePath]);
|
|
98
|
+
if (!result.ok)
|
|
99
|
+
throw new Error(result.error || "Failed to count rows");
|
|
100
|
+
return Number(result.rows || 0);
|
|
101
|
+
}
|
|
52
102
|
/**
|
|
53
103
|
* Sync Python scripts from the application package to the stable data directory (~/.vesper/python)
|
|
54
104
|
*/
|
|
@@ -105,6 +155,21 @@ const dataCleaner = new DataCleaner(__dirname);
|
|
|
105
155
|
const pipelineExecutor = new PipelineExecutor(dataRoot, __dirname);
|
|
106
156
|
const dataSplitter = new DataSplitter(__dirname);
|
|
107
157
|
const dataExporter = new DataExporter(__dirname);
|
|
158
|
+
const fusionEngine = new DataFusionEngine(__dirname);
|
|
159
|
+
const kaggleSource = new KaggleSource(__dirname);
|
|
160
|
+
const secureKeys = new SecureKeysManager(__dirname);
|
|
161
|
+
function hydrateExternalKeys() {
|
|
162
|
+
const keys = secureKeys.getAll();
|
|
163
|
+
if (!process.env.HF_TOKEN && !process.env.HUGGINGFACE_TOKEN && keys.hf_token) {
|
|
164
|
+
process.env.HF_TOKEN = String(keys.hf_token);
|
|
165
|
+
}
|
|
166
|
+
if (!process.env.KAGGLE_USERNAME && keys.kaggle_username) {
|
|
167
|
+
process.env.KAGGLE_USERNAME = String(keys.kaggle_username);
|
|
168
|
+
}
|
|
169
|
+
if (!process.env.KAGGLE_KEY && keys.kaggle_key) {
|
|
170
|
+
process.env.KAGGLE_KEY = String(keys.kaggle_key);
|
|
171
|
+
}
|
|
172
|
+
}
|
|
108
173
|
// CRITICAL FIX: Pass __dirname (build directory) to analyzers
|
|
109
174
|
// Python scripts are in build/python/, so analyzers should look relative to build/
|
|
110
175
|
// NOT relative to project root (appRoot)
|
|
@@ -136,7 +201,7 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
136
201
|
console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
|
|
137
202
|
const metadata = job.metadata ? JSON.parse(job.metadata) : {};
|
|
138
203
|
switch (job.type) {
|
|
139
|
-
case "prepare": return await handlePrepareJob(job.id, metadata.query);
|
|
204
|
+
case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
|
|
140
205
|
case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
|
|
141
206
|
default: throw new Error(`Unhandled job type: ${job.type}`);
|
|
142
207
|
}
|
|
@@ -154,8 +219,9 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
154
219
|
/**
|
|
155
220
|
* Logic for preparing a dataset (Search + Ingest + Process)
|
|
156
221
|
*/
|
|
157
|
-
async function handlePrepareJob(jobId, query) {
|
|
222
|
+
async function handlePrepareJob(jobId, query, requirements) {
|
|
158
223
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
224
|
+
const requestedRows = extractRequestedRows(query, requirements);
|
|
159
225
|
update({ progress: 10, status_text: "Searching for best dataset matching query..." });
|
|
160
226
|
const results = await searchEngine.search(query, { limit: 1 });
|
|
161
227
|
if (results.length === 0) {
|
|
@@ -176,9 +242,59 @@ async function handlePrepareJob(jobId, query) {
|
|
|
176
242
|
}
|
|
177
243
|
update({ progress: 30, status_text: `Starting download from ${source}...` });
|
|
178
244
|
// ensureData handles download and returns path to the raw file
|
|
179
|
-
|
|
245
|
+
let rawFilePath = await dataIngestor.ensureData(topDataset.id, source, (msg, prog) => {
|
|
180
246
|
update({ status_text: msg, progress: 30 + (prog ? Math.floor(prog * 0.4) : 0) });
|
|
181
247
|
});
|
|
248
|
+
if (requestedRows && requestedRows > 0) {
|
|
249
|
+
update({ progress: 62, status_text: `Validating requested sample count (${requestedRows.toLocaleString()})...` });
|
|
250
|
+
let currentRows = await countRows(rawFilePath);
|
|
251
|
+
if (currentRows < requestedRows) {
|
|
252
|
+
update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
|
|
253
|
+
const additional = await searchEngine.search(query, { limit: 8 });
|
|
254
|
+
const sourceFiles = [rawFilePath];
|
|
255
|
+
let totalRows = currentRows;
|
|
256
|
+
for (const ds of additional) {
|
|
257
|
+
if (ds.id === topDataset.id)
|
|
258
|
+
continue;
|
|
259
|
+
try {
|
|
260
|
+
const dsSource = ds.source;
|
|
261
|
+
if (dsSource === "kaggle" && !dataIngestor.hasKaggleCredentials())
|
|
262
|
+
continue;
|
|
263
|
+
const p = await dataIngestor.ensureData(ds.id, dsSource, () => undefined);
|
|
264
|
+
const r = await countRows(p);
|
|
265
|
+
if (r <= 0)
|
|
266
|
+
continue;
|
|
267
|
+
sourceFiles.push(p);
|
|
268
|
+
totalRows += r;
|
|
269
|
+
if (totalRows >= requestedRows)
|
|
270
|
+
break;
|
|
271
|
+
}
|
|
272
|
+
catch {
|
|
273
|
+
// ignore candidate failures and continue trying
|
|
274
|
+
}
|
|
275
|
+
}
|
|
276
|
+
if (sourceFiles.length > 1) {
|
|
277
|
+
update({ progress: 67, status_text: `Fusing ${sourceFiles.length} datasets to meet row target...` });
|
|
278
|
+
const fusedPath = path.join(dataRoot, "fusion", `prepare_fused_${Date.now()}.feather`);
|
|
279
|
+
const fusionResult = await fusionEngine.fuse(sourceFiles, fusedPath, {
|
|
280
|
+
strategy: "concat",
|
|
281
|
+
dedup: true,
|
|
282
|
+
run_quality_after: false,
|
|
283
|
+
leakage_check: false,
|
|
284
|
+
output_format: "feather",
|
|
285
|
+
compression: "lz4",
|
|
286
|
+
preview: true,
|
|
287
|
+
});
|
|
288
|
+
rawFilePath = fusionResult.output_path;
|
|
289
|
+
currentRows = await countRows(rawFilePath);
|
|
290
|
+
}
|
|
291
|
+
if (currentRows < requestedRows) {
|
|
292
|
+
throw new Error(`Requested ${requestedRows.toLocaleString()} samples, but only ${currentRows.toLocaleString()} available across current matches. ` +
|
|
293
|
+
`Try broader query or enable additional sources.`);
|
|
294
|
+
}
|
|
295
|
+
update({ progress: 69, status_text: `✅ Sample target met: ${currentRows.toLocaleString()} rows` });
|
|
296
|
+
}
|
|
297
|
+
}
|
|
182
298
|
update({ progress: 70, status_text: "Analyzing dataset quality..." });
|
|
183
299
|
const report = await qualityAnalyzer.analyze(rawFilePath);
|
|
184
300
|
// Update local metadata with quality info
|
|
@@ -234,10 +350,84 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
234
350
|
type: "string",
|
|
235
351
|
description: "The search query. Use -term to exclude keywords.",
|
|
236
352
|
},
|
|
353
|
+
enable_jit: {
|
|
354
|
+
type: "boolean",
|
|
355
|
+
description: "Enable live JIT search when local library results are insufficient (default: false).",
|
|
356
|
+
},
|
|
357
|
+
},
|
|
358
|
+
required: ["query"],
|
|
359
|
+
},
|
|
360
|
+
},
|
|
361
|
+
{
|
|
362
|
+
name: "discover_datasets",
|
|
363
|
+
description: "Discover datasets from a specific source. Kaggle is optional and requires user-provided API key.",
|
|
364
|
+
inputSchema: {
|
|
365
|
+
type: "object",
|
|
366
|
+
properties: {
|
|
367
|
+
query: {
|
|
368
|
+
type: "string",
|
|
369
|
+
description: "Search query, e.g. 'credit risk'.",
|
|
370
|
+
},
|
|
371
|
+
source: {
|
|
372
|
+
type: "string",
|
|
373
|
+
enum: ["huggingface", "kaggle"],
|
|
374
|
+
description: "Data source to discover from.",
|
|
375
|
+
},
|
|
376
|
+
limit: {
|
|
377
|
+
type: "number",
|
|
378
|
+
description: "Max results to return (default: 10).",
|
|
379
|
+
},
|
|
237
380
|
},
|
|
238
381
|
required: ["query"],
|
|
239
382
|
},
|
|
240
383
|
},
|
|
384
|
+
{
|
|
385
|
+
name: "download_dataset",
|
|
386
|
+
description: "Download a dataset by source and ID/slug into local Vesper storage. Kaggle requires optional API key.",
|
|
387
|
+
inputSchema: {
|
|
388
|
+
type: "object",
|
|
389
|
+
properties: {
|
|
390
|
+
source: {
|
|
391
|
+
type: "string",
|
|
392
|
+
enum: ["huggingface", "kaggle"],
|
|
393
|
+
description: "Dataset source.",
|
|
394
|
+
},
|
|
395
|
+
dataset_id: {
|
|
396
|
+
type: "string",
|
|
397
|
+
description: "Dataset ID/slug (e.g. user/dataset for Kaggle or HF).",
|
|
398
|
+
},
|
|
399
|
+
target_dir: {
|
|
400
|
+
type: "string",
|
|
401
|
+
description: "Optional target directory for downloaded files.",
|
|
402
|
+
}
|
|
403
|
+
},
|
|
404
|
+
required: ["source", "dataset_id"],
|
|
405
|
+
},
|
|
406
|
+
},
|
|
407
|
+
{
|
|
408
|
+
name: "configure_kaggle",
|
|
409
|
+
description: "Optionally store Kaggle API credentials for Kaggle discover/download. Core Vesper works without this.",
|
|
410
|
+
inputSchema: {
|
|
411
|
+
type: "object",
|
|
412
|
+
properties: {
|
|
413
|
+
username: { type: "string", description: "Kaggle username" },
|
|
414
|
+
key: { type: "string", description: "Kaggle API key" }
|
|
415
|
+
},
|
|
416
|
+
required: ["username", "key"],
|
|
417
|
+
},
|
|
418
|
+
},
|
|
419
|
+
{
|
|
420
|
+
name: "configure_keys",
|
|
421
|
+
description: "One-time optional key setup for external sources (Kaggle + gated HF). Core tools do not require keys.",
|
|
422
|
+
inputSchema: {
|
|
423
|
+
type: "object",
|
|
424
|
+
properties: {
|
|
425
|
+
hf_token: { type: "string", description: "Optional Hugging Face token for gated/private datasets" },
|
|
426
|
+
kaggle_username: { type: "string", description: "Optional Kaggle username" },
|
|
427
|
+
kaggle_key: { type: "string", description: "Optional Kaggle API key" }
|
|
428
|
+
},
|
|
429
|
+
},
|
|
430
|
+
},
|
|
241
431
|
{
|
|
242
432
|
name: "get_dataset_info",
|
|
243
433
|
description: "Get detailed metadata for a specific dataset by its ID. Returns comprehensive information including license, safety flags, and data characteristics.",
|
|
@@ -346,7 +536,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
346
536
|
},
|
|
347
537
|
{
|
|
348
538
|
name: "export_dataset",
|
|
349
|
-
description: "Export
|
|
539
|
+
description: "Export a dataset to a local directory. Use format='feather' (default) for 5-10× faster writes than CSV. Add fast=true to skip quality/cleaning steps.",
|
|
350
540
|
inputSchema: {
|
|
351
541
|
type: "object",
|
|
352
542
|
properties: {
|
|
@@ -360,13 +550,93 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
360
550
|
},
|
|
361
551
|
format: {
|
|
362
552
|
type: "string",
|
|
363
|
-
enum: ["
|
|
364
|
-
description: "
|
|
553
|
+
enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
|
|
554
|
+
description: "Output format. feather (fastest), parquet (best compression), csv (human-readable). Default: feather.",
|
|
555
|
+
},
|
|
556
|
+
compression: {
|
|
557
|
+
type: "string",
|
|
558
|
+
enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"],
|
|
559
|
+
description: "Compression algorithm. Default: lz4 for feather, snappy for parquet, none for csv.",
|
|
560
|
+
},
|
|
561
|
+
fast: {
|
|
562
|
+
type: "boolean",
|
|
563
|
+
description: "Skip quality analysis and cleaning – raw export only. Much faster. Default: false.",
|
|
564
|
+
},
|
|
565
|
+
preview: {
|
|
566
|
+
type: "boolean",
|
|
567
|
+
description: "Generate a small 500-row CSV preview alongside binary exports. Default: false.",
|
|
568
|
+
},
|
|
569
|
+
sample_rows: {
|
|
570
|
+
type: "number",
|
|
571
|
+
description: "Export only this many random rows (faster for huge datasets).",
|
|
572
|
+
},
|
|
573
|
+
columns: {
|
|
574
|
+
type: "array",
|
|
575
|
+
items: { type: "string" },
|
|
576
|
+
description: "Export only these columns (faster for wide datasets).",
|
|
365
577
|
},
|
|
366
578
|
},
|
|
367
579
|
required: ["dataset_id"],
|
|
368
580
|
},
|
|
369
581
|
},
|
|
582
|
+
{
|
|
583
|
+
name: "fuse_datasets",
|
|
584
|
+
description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
|
|
585
|
+
inputSchema: {
|
|
586
|
+
type: "object",
|
|
587
|
+
properties: {
|
|
588
|
+
sources: {
|
|
589
|
+
type: "array",
|
|
590
|
+
items: { type: "string" },
|
|
591
|
+
description: "List of dataset IDs and/or local file paths to fuse.",
|
|
592
|
+
},
|
|
593
|
+
strategy: {
|
|
594
|
+
type: "string",
|
|
595
|
+
enum: ["concat", "join"],
|
|
596
|
+
description: "Fusion strategy. concat appends rows; join merges on key(s).",
|
|
597
|
+
},
|
|
598
|
+
join_on: {
|
|
599
|
+
oneOf: [
|
|
600
|
+
{ type: "string" },
|
|
601
|
+
{ type: "array", items: { type: "string" } }
|
|
602
|
+
],
|
|
603
|
+
description: "Join key(s). Required when strategy='join'.",
|
|
604
|
+
},
|
|
605
|
+
how: {
|
|
606
|
+
type: "string",
|
|
607
|
+
enum: ["inner", "left", "outer"],
|
|
608
|
+
description: "Join mode (only for strategy='join').",
|
|
609
|
+
},
|
|
610
|
+
dedup: {
|
|
611
|
+
type: "boolean",
|
|
612
|
+
description: "Drop exact duplicate rows after fusion.",
|
|
613
|
+
},
|
|
614
|
+
run_quality_after: {
|
|
615
|
+
type: "boolean",
|
|
616
|
+
description: "Run quality analysis on the fused output.",
|
|
617
|
+
},
|
|
618
|
+
leakage_check: {
|
|
619
|
+
type: "boolean",
|
|
620
|
+
description: "Run leakage/overlap checks across fused sources.",
|
|
621
|
+
},
|
|
622
|
+
output_format: {
|
|
623
|
+
type: "string",
|
|
624
|
+
enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
|
|
625
|
+
description: "Output format (default: feather).",
|
|
626
|
+
},
|
|
627
|
+
compression: {
|
|
628
|
+
type: "string",
|
|
629
|
+
enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"],
|
|
630
|
+
description: "Compression algorithm for binary outputs.",
|
|
631
|
+
},
|
|
632
|
+
preview: {
|
|
633
|
+
type: "boolean",
|
|
634
|
+
description: "Generate a small preview CSV of fused output.",
|
|
635
|
+
},
|
|
636
|
+
},
|
|
637
|
+
required: ["sources"],
|
|
638
|
+
},
|
|
639
|
+
},
|
|
370
640
|
{
|
|
371
641
|
name: "analyze_image_quality",
|
|
372
642
|
description: "Analyze image quality (resolution, blur, corruption) for a folder or single image.",
|
|
@@ -423,10 +693,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
423
693
|
const query = String(request.params.arguments?.query);
|
|
424
694
|
const limit = 5;
|
|
425
695
|
const safeOnly = true; // Enable safe filter by default
|
|
696
|
+
const enableJIT = request.params.arguments?.enable_jit === true;
|
|
426
697
|
if (!query) {
|
|
427
698
|
throw new McpError(ErrorCode.InvalidParams, "Query is required");
|
|
428
699
|
}
|
|
429
|
-
const results = await searchEngine.search(query, { limit, safeOnly });
|
|
700
|
+
const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
|
|
430
701
|
const formattedOutput = formatSearchResults(results);
|
|
431
702
|
return {
|
|
432
703
|
content: [
|
|
@@ -437,6 +708,123 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
437
708
|
],
|
|
438
709
|
};
|
|
439
710
|
}
|
|
711
|
+
case "discover_datasets": {
|
|
712
|
+
hydrateExternalKeys();
|
|
713
|
+
const query = String(request.params.arguments?.query || "").trim();
|
|
714
|
+
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
715
|
+
const limit = Number(request.params.arguments?.limit || 10);
|
|
716
|
+
if (!query) {
|
|
717
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required");
|
|
718
|
+
}
|
|
719
|
+
try {
|
|
720
|
+
let results = [];
|
|
721
|
+
if (source === "kaggle") {
|
|
722
|
+
if (!dataIngestor.hasKaggleCredentials()) {
|
|
723
|
+
return {
|
|
724
|
+
content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
|
|
725
|
+
isError: true,
|
|
726
|
+
};
|
|
727
|
+
}
|
|
728
|
+
results = await kaggleSource.discover(query, limit);
|
|
729
|
+
}
|
|
730
|
+
else {
|
|
731
|
+
const hf = new HuggingFaceScraper();
|
|
732
|
+
results = await hf.scrape(Math.max(1, limit), true, query);
|
|
733
|
+
}
|
|
734
|
+
const formattedOutput = formatSearchResults(results.slice(0, limit));
|
|
735
|
+
return {
|
|
736
|
+
content: [{ type: "text", text: formattedOutput }]
|
|
737
|
+
};
|
|
738
|
+
}
|
|
739
|
+
catch (error) {
|
|
740
|
+
return {
|
|
741
|
+
content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
|
|
742
|
+
isError: true,
|
|
743
|
+
};
|
|
744
|
+
}
|
|
745
|
+
}
|
|
746
|
+
case "download_dataset": {
|
|
747
|
+
hydrateExternalKeys();
|
|
748
|
+
const source = String(request.params.arguments?.source || "").toLowerCase();
|
|
749
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
750
|
+
if (!source || !datasetId) {
|
|
751
|
+
throw new McpError(ErrorCode.InvalidParams, "source and dataset_id are required");
|
|
752
|
+
}
|
|
753
|
+
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
754
|
+
return {
|
|
755
|
+
content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
|
|
756
|
+
isError: true,
|
|
757
|
+
};
|
|
758
|
+
}
|
|
759
|
+
try {
|
|
760
|
+
const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
|
|
761
|
+
return {
|
|
762
|
+
content: [{ type: "text", text: `✅ Download complete: ${localPath}` }]
|
|
763
|
+
};
|
|
764
|
+
}
|
|
765
|
+
catch (error) {
|
|
766
|
+
return {
|
|
767
|
+
content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
|
|
768
|
+
isError: true,
|
|
769
|
+
};
|
|
770
|
+
}
|
|
771
|
+
}
|
|
772
|
+
case "configure_kaggle": {
|
|
773
|
+
const username = String(request.params.arguments?.username || "").trim();
|
|
774
|
+
const key = String(request.params.arguments?.key || "").trim();
|
|
775
|
+
if (!username || !key) {
|
|
776
|
+
throw new McpError(ErrorCode.InvalidParams, "username and key are required");
|
|
777
|
+
}
|
|
778
|
+
const r1 = secureKeys.set("kaggle_username", username);
|
|
779
|
+
const r2 = secureKeys.set("kaggle_key", key);
|
|
780
|
+
process.env.KAGGLE_USERNAME = username;
|
|
781
|
+
process.env.KAGGLE_KEY = key;
|
|
782
|
+
return {
|
|
783
|
+
content: [{ type: "text", text: `✅ Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
|
|
784
|
+
};
|
|
785
|
+
}
|
|
786
|
+
case "configure_keys": {
|
|
787
|
+
const hfToken = String(request.params.arguments?.hf_token || "").trim();
|
|
788
|
+
const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
|
|
789
|
+
const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
|
|
790
|
+
const saved = [];
|
|
791
|
+
const methods = [];
|
|
792
|
+
if (hfToken) {
|
|
793
|
+
const r = secureKeys.set("hf_token", hfToken);
|
|
794
|
+
if (r.ok) {
|
|
795
|
+
process.env.HF_TOKEN = hfToken;
|
|
796
|
+
saved.push("HF token");
|
|
797
|
+
if (r.method)
|
|
798
|
+
methods.push(r.method);
|
|
799
|
+
}
|
|
800
|
+
}
|
|
801
|
+
if (kaggleUsername) {
|
|
802
|
+
const r = secureKeys.set("kaggle_username", kaggleUsername);
|
|
803
|
+
if (r.ok) {
|
|
804
|
+
process.env.KAGGLE_USERNAME = kaggleUsername;
|
|
805
|
+
saved.push("Kaggle username");
|
|
806
|
+
if (r.method)
|
|
807
|
+
methods.push(r.method);
|
|
808
|
+
}
|
|
809
|
+
}
|
|
810
|
+
if (kaggleKey) {
|
|
811
|
+
const r = secureKeys.set("kaggle_key", kaggleKey);
|
|
812
|
+
if (r.ok) {
|
|
813
|
+
process.env.KAGGLE_KEY = kaggleKey;
|
|
814
|
+
saved.push("Kaggle key");
|
|
815
|
+
if (r.method)
|
|
816
|
+
methods.push(r.method);
|
|
817
|
+
}
|
|
818
|
+
}
|
|
819
|
+
if (saved.length === 0) {
|
|
820
|
+
return {
|
|
821
|
+
content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
|
|
822
|
+
};
|
|
823
|
+
}
|
|
824
|
+
return {
|
|
825
|
+
content: [{ type: "text", text: `✅ Key saved securely. Updated: ${saved.join(", ")}.` }]
|
|
826
|
+
};
|
|
827
|
+
}
|
|
440
828
|
case "get_dataset_info": {
|
|
441
829
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
442
830
|
if (!datasetId) {
|
|
@@ -546,7 +934,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
546
934
|
}
|
|
547
935
|
case "prepare_dataset": {
|
|
548
936
|
const query = String(request.params.arguments?.query);
|
|
549
|
-
const
|
|
937
|
+
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
938
|
+
const job = jobManager.createJob("prepare", 0, { query, requirements });
|
|
550
939
|
return {
|
|
551
940
|
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
|
|
552
941
|
};
|
|
@@ -577,7 +966,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
577
966
|
case "export_dataset": {
|
|
578
967
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
579
968
|
const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
|
|
580
|
-
const requestedFormat = request.params.arguments?.format || "
|
|
969
|
+
const requestedFormat = String(request.params.arguments?.format || "feather");
|
|
970
|
+
const fastMode = request.params.arguments?.fast === true;
|
|
971
|
+
const preview = request.params.arguments?.preview === true;
|
|
972
|
+
const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
|
|
973
|
+
const columns = request.params.arguments?.columns;
|
|
974
|
+
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
581
975
|
const dataset = metadataStore.getDataset(datasetId);
|
|
582
976
|
if (!dataset) {
|
|
583
977
|
throw new McpError(ErrorCode.InvalidParams, `Dataset not found: ${datasetId}`);
|
|
@@ -591,30 +985,153 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
591
985
|
};
|
|
592
986
|
}
|
|
593
987
|
let sourcePath = downloadStatus.local_path;
|
|
594
|
-
//
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
988
|
+
// If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
|
|
989
|
+
if (!fastMode) {
|
|
990
|
+
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
991
|
+
const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "csv";
|
|
992
|
+
if (currentExt !== pipelineFmt) {
|
|
993
|
+
console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
|
|
994
|
+
try {
|
|
995
|
+
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
996
|
+
if (pipelineResult.final_output_path) {
|
|
997
|
+
sourcePath = pipelineResult.final_output_path;
|
|
998
|
+
}
|
|
999
|
+
}
|
|
1000
|
+
catch (err) {
|
|
1001
|
+
console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
|
|
1002
|
+
}
|
|
607
1003
|
}
|
|
608
1004
|
}
|
|
1005
|
+
else {
|
|
1006
|
+
console.error(`[Export] ⚡ Fast mode – skipping quality analysis and cleaning`);
|
|
1007
|
+
}
|
|
1008
|
+
// Build export options
|
|
1009
|
+
const exportOpts = {};
|
|
1010
|
+
if (compression)
|
|
1011
|
+
exportOpts.compression = compression;
|
|
1012
|
+
if (preview)
|
|
1013
|
+
exportOpts.preview = true;
|
|
1014
|
+
if (sampleRows)
|
|
1015
|
+
exportOpts.sample_rows = sampleRows;
|
|
1016
|
+
if (columns)
|
|
1017
|
+
exportOpts.columns = columns;
|
|
609
1018
|
try {
|
|
610
|
-
|
|
1019
|
+
// Determine output file name
|
|
1020
|
+
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
1021
|
+
const ext = extMap[requestedFormat] || ".feather";
|
|
1022
|
+
const safeName = datasetId.replace(/\//g, "_");
|
|
1023
|
+
const outDir = targetDir || path.join(dataRoot, "exports");
|
|
1024
|
+
if (!fs.existsSync(outDir))
|
|
1025
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
1026
|
+
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
1027
|
+
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
1028
|
+
// Build rich response
|
|
1029
|
+
let msg = `✅ **Export complete**\n`;
|
|
1030
|
+
msg += `- **File**: ${result.output_path}\n`;
|
|
1031
|
+
msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
|
|
1032
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
1033
|
+
if (result.file_size_mb !== undefined)
|
|
1034
|
+
msg += `- **Size**: ${result.file_size_mb} MB\n`;
|
|
1035
|
+
if (result.elapsed_seconds !== undefined)
|
|
1036
|
+
msg += `- **Time**: ${result.elapsed_seconds}s\n`;
|
|
1037
|
+
if (result.preview_path)
|
|
1038
|
+
msg += `- **Preview**: ${result.preview_path}\n`;
|
|
1039
|
+
msg += `\n`;
|
|
1040
|
+
if (requestedFormat === "feather") {
|
|
1041
|
+
msg += `💡 **Inspect with:**\n`;
|
|
1042
|
+
msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
|
|
1043
|
+
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
1044
|
+
}
|
|
1045
|
+
else if (requestedFormat === "parquet") {
|
|
1046
|
+
msg += `💡 **Inspect with:**\n`;
|
|
1047
|
+
msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
|
|
1048
|
+
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
1049
|
+
}
|
|
1050
|
+
return { content: [{ type: "text", text: msg }] };
|
|
1051
|
+
}
|
|
1052
|
+
catch (error) {
|
|
611
1053
|
return {
|
|
612
|
-
content: [{ type: "text", text:
|
|
1054
|
+
content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
|
|
1055
|
+
isError: true
|
|
613
1056
|
};
|
|
614
1057
|
}
|
|
1058
|
+
}
|
|
1059
|
+
case "fuse_datasets": {
|
|
1060
|
+
const rawSources = request.params.arguments?.sources;
|
|
1061
|
+
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
1062
|
+
throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
|
|
1063
|
+
}
|
|
1064
|
+
const strategy = request.params.arguments?.strategy || "concat";
|
|
1065
|
+
const joinOn = request.params.arguments?.join_on;
|
|
1066
|
+
const how = request.params.arguments?.how || "inner";
|
|
1067
|
+
const dedup = request.params.arguments?.dedup !== false;
|
|
1068
|
+
const runQualityAfter = request.params.arguments?.run_quality_after !== false;
|
|
1069
|
+
const leakageCheck = request.params.arguments?.leakage_check !== false;
|
|
1070
|
+
const outputFormat = request.params.arguments?.output_format || "feather";
|
|
1071
|
+
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
1072
|
+
const preview = request.params.arguments?.preview !== false;
|
|
1073
|
+
const resolvedPaths = [];
|
|
1074
|
+
const unresolved = [];
|
|
1075
|
+
for (const src of rawSources) {
|
|
1076
|
+
if (fs.existsSync(src)) {
|
|
1077
|
+
resolvedPaths.push(src);
|
|
1078
|
+
continue;
|
|
1079
|
+
}
|
|
1080
|
+
const status = metadataStore.getDownloadStatus(src);
|
|
1081
|
+
if (status?.local_path && fs.existsSync(status.local_path)) {
|
|
1082
|
+
resolvedPaths.push(status.local_path);
|
|
1083
|
+
continue;
|
|
1084
|
+
}
|
|
1085
|
+
unresolved.push(src);
|
|
1086
|
+
}
|
|
1087
|
+
if (unresolved.length > 0) {
|
|
1088
|
+
return {
|
|
1089
|
+
content: [{
|
|
1090
|
+
type: "text",
|
|
1091
|
+
text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
|
|
1092
|
+
}],
|
|
1093
|
+
isError: true
|
|
1094
|
+
};
|
|
1095
|
+
}
|
|
1096
|
+
try {
|
|
1097
|
+
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
|
|
1098
|
+
const ext = extMap[outputFormat] || ".feather";
|
|
1099
|
+
const outDir = path.join(dataRoot, "fusion");
|
|
1100
|
+
if (!fs.existsSync(outDir))
|
|
1101
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
1102
|
+
const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
|
|
1103
|
+
const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
|
|
1104
|
+
strategy,
|
|
1105
|
+
join_on: joinOn,
|
|
1106
|
+
how,
|
|
1107
|
+
dedup,
|
|
1108
|
+
run_quality_after: runQualityAfter,
|
|
1109
|
+
leakage_check: leakageCheck,
|
|
1110
|
+
output_format: outputFormat,
|
|
1111
|
+
compression: compression,
|
|
1112
|
+
preview,
|
|
1113
|
+
});
|
|
1114
|
+
const nullDelta = result.stats.null_delta;
|
|
1115
|
+
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
1116
|
+
let msg = `✅ Fused ${result.stats.sources_count} sources → ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
|
|
1117
|
+
msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
|
|
1118
|
+
msg += `- Null change: ${nullText}\n`;
|
|
1119
|
+
msg += `- Output: ${result.output_path}\n`;
|
|
1120
|
+
if (result.preview_path)
|
|
1121
|
+
msg += `- Preview: ${result.preview_path}\n`;
|
|
1122
|
+
if (result.leakage_report) {
|
|
1123
|
+
msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
|
|
1124
|
+
if (result.leakage_report.leakage_count) {
|
|
1125
|
+
msg += ` (${result.leakage_report.leakage_count})`;
|
|
1126
|
+
}
|
|
1127
|
+
msg += "\n";
|
|
1128
|
+
}
|
|
1129
|
+
msg += `\nNext: run split_dataset/export_dataset on fused output.`;
|
|
1130
|
+
return { content: [{ type: "text", text: msg }] };
|
|
1131
|
+
}
|
|
615
1132
|
catch (error) {
|
|
616
1133
|
return {
|
|
617
|
-
content: [{ type: "text", text: `ERROR:
|
|
1134
|
+
content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
|
|
618
1135
|
isError: true
|
|
619
1136
|
};
|
|
620
1137
|
}
|
|
@@ -761,8 +1278,29 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
761
1278
|
});
|
|
762
1279
|
async function main() {
|
|
763
1280
|
const args = process.argv.slice(2);
|
|
1281
|
+
hydrateExternalKeys();
|
|
1282
|
+
const isFuse = args.includes("fuse");
|
|
1283
|
+
const isDiscover = args.includes("discover");
|
|
1284
|
+
const isDownload = args.includes("download");
|
|
1285
|
+
const isConfig = args.includes("config");
|
|
764
1286
|
const isSetup = args.includes("--setup") || args.includes("setup");
|
|
765
1287
|
const isSilent = args.includes("--silent");
|
|
1288
|
+
if (isFuse) {
|
|
1289
|
+
await runFuseCli(args);
|
|
1290
|
+
return;
|
|
1291
|
+
}
|
|
1292
|
+
if (isConfig) {
|
|
1293
|
+
await runConfigCli(args);
|
|
1294
|
+
return;
|
|
1295
|
+
}
|
|
1296
|
+
if (isDiscover) {
|
|
1297
|
+
await runDiscoverCli(args);
|
|
1298
|
+
return;
|
|
1299
|
+
}
|
|
1300
|
+
if (isDownload) {
|
|
1301
|
+
await runDownloadCli(args);
|
|
1302
|
+
return;
|
|
1303
|
+
}
|
|
766
1304
|
// If run in setup mode OR in a terminal without args (human call), show setup wizard
|
|
767
1305
|
if (isSetup || (process.stdin.isTTY && args.length === 0)) {
|
|
768
1306
|
await runSetupWizard(isSilent);
|
|
@@ -778,6 +1316,256 @@ async function main() {
|
|
|
778
1316
|
console.error("Tip: To configure Vesper for your IDE, run: npx @vespermcp/mcp-server --setup");
|
|
779
1317
|
console.log("[Vesper] Main loop finished");
|
|
780
1318
|
}
|
|
1319
|
+
/**
 * `vespermcp config …` CLI entry point.
 *
 * Two sub-modes, selected by positional tokens in `args`:
 *   - `config keys`   → interactive wizard for all optional keys (HF + Kaggle);
 *                       every field may be skipped by pressing Enter.
 *   - `config kaggle` → backward-compatible Kaggle-only path, accepting
 *                       `--username`/`--key` flags or prompting for the missing ones.
 *
 * Keys are persisted via `secureKeys.set(...)` and mirrored into `process.env`
 * so the current process can use them immediately without a restart.
 *
 * @param {string[]} args - raw CLI argument list (includes the subcommand tokens).
 */
async function runConfigCli(args) {
    const isKeys = args.includes("keys");
    const isKaggle = args.includes("kaggle");
    // No recognized sub-mode (or explicit --help): print usage and bail out.
    if (!(isKeys || isKaggle) || args.includes("--help")) {
        console.log("Usage: vespermcp config keys");
        console.log(" vespermcp config kaggle --username <name> --key <api_key>");
        console.log("Core Vesper tools work with zero API keys.");
        return;
    }
    // Returns the token following a `--flag`, or undefined when absent/dangling.
    const getArgValue = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        return undefined;
    };
    if (isKeys) {
        // Interactive multi-key wizard. Prompts show "saved"/"empty" so the user
        // can tell which keys already exist without the secret being echoed.
        console.log("\n🔐 Vesper Optional Keys Setup");
        console.log("(Press Enter to skip any field)\n");
        const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
        const ask = (q) => new Promise(resolve => rl.question(q, resolve));
        const current = secureKeys.getAll();
        const hfToken = (await ask(`Hugging Face token [${current.hf_token ? "saved" : "empty"}]: `)).trim();
        const kaggleUsername = (await ask(`Kaggle username [${current.kaggle_username ? "saved" : "empty"}]: `)).trim();
        const kaggleKey = (await ask(`Kaggle key [${current.kaggle_key ? "saved" : "empty"}]: `)).trim();
        rl.close();
        // Track which keys were actually persisted (secureKeys.set reports `ok`).
        const saved = [];
        if (hfToken) {
            const res = secureKeys.set("hf_token", hfToken);
            if (res.ok) {
                // Export into the live environment so downstream calls pick it up now.
                process.env.HF_TOKEN = hfToken;
                saved.push("HF token");
            }
        }
        if (kaggleUsername) {
            const res = secureKeys.set("kaggle_username", kaggleUsername);
            if (res.ok) {
                process.env.KAGGLE_USERNAME = kaggleUsername;
                saved.push("Kaggle username");
            }
        }
        if (kaggleKey) {
            const res = secureKeys.set("kaggle_key", kaggleKey);
            if (res.ok) {
                process.env.KAGGLE_KEY = kaggleKey;
                saved.push("Kaggle key");
            }
        }
        if (saved.length === 0) {
            console.log("No new keys saved (all skipped). Core tools continue to work without keys.");
            return;
        }
        console.log(`✅ Key(s) saved securely: ${saved.join(", ")}`);
        console.log("You can now use Kaggle and gated Hugging Face datasets.");
        return;
    }
    // Backward-compatible Kaggle-specific path
    let username = getArgValue("--username") || "";
    let key = getArgValue("--key") || "";
    // Prompt interactively only for whichever credential was not supplied as a flag.
    if (!username || !key) {
        const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
        const ask = (q) => new Promise(resolve => rl.question(q, resolve));
        if (!username)
            username = (await ask("Kaggle username: ")).trim();
        if (!key)
            key = (await ask("Kaggle key: ")).trim();
        rl.close();
    }
    // Still missing after prompting → hard failure (non-zero exit for scripting).
    if (!username || !key) {
        console.error("Missing Kaggle username/key. Aborting.");
        process.exit(1);
    }
    // NOTE(review): unlike the `keys` branch above, the set(...) results are not
    // checked here — a persistence failure would go unreported. Confirm intended.
    secureKeys.set("kaggle_username", username);
    secureKeys.set("kaggle_key", key);
    process.env.KAGGLE_USERNAME = username;
    process.env.KAGGLE_KEY = key;
    console.log("✅ Key saved securely. You can now use Kaggle datasets.");
}
|
|
1396
|
+
/**
 * `vespermcp discover` CLI entry point.
 *
 * Usage: `vespermcp discover [--source huggingface|kaggle] [--limit N] <query words>`
 * Every positional token (other than the `discover` subcommand and flag values)
 * is joined into the search query. Kaggle requires credentials; when missing and
 * stdin is a TTY, the user is offered the interactive config flow.
 *
 * Fix: `--limit` was parsed with a bare `Number(...)`, so `--limit abc` (NaN) or
 * a zero/negative value flowed straight into the search APIs. Non-finite or
 * non-positive limits now fall back to the default of 10.
 *
 * @param {string[]} args - raw CLI argument list.
 */
async function runDiscoverCli(args) {
    // Returns the token following a `--flag`, or undefined when absent/dangling.
    const getArgValue = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        return undefined;
    };
    const source = (getArgValue("--source") || "huggingface").toLowerCase();
    const rawLimit = Number(getArgValue("--limit") || "10");
    // Guard: only accept a finite, positive integer count; otherwise default to 10.
    const limit = Number.isFinite(rawLimit) && rawLimit > 0 ? Math.floor(rawLimit) : 10;
    // Everything that is not the subcommand, a flag, or a flag's value is query text.
    const queryParts = [];
    for (let i = 0; i < args.length; i++) {
        const token = args[i];
        if (token === "discover")
            continue;
        if (token === "--source" || token === "--limit") {
            i += 1; // skip the flag's value as well
            continue;
        }
        if (token.startsWith("--"))
            continue;
        queryParts.push(token);
    }
    const query = queryParts.join(" ").trim();
    if (!query) {
        console.error("Usage: vespermcp discover --source kaggle \"credit risk\" --limit 10");
        process.exit(1);
    }
    if (source === "kaggle") {
        if (!dataIngestor.hasKaggleCredentials()) {
            console.error("Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).");
            // Only offer interactive setup when a human is attached to stdin.
            if (process.stdin.isTTY) {
                const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
                const answer = await new Promise(resolve => rl.question("Configure Kaggle now? [y/N]: ", resolve));
                rl.close();
                if (answer.trim().toLowerCase() === "y") {
                    await runConfigCli(["config", "kaggle"]);
                }
            }
            // Re-check: setup may have been declined or failed.
            if (!dataIngestor.hasKaggleCredentials())
                process.exit(1);
        }
        try {
            const results = await kaggleSource.discover(query, limit);
            console.log(formatSearchResults(results));
        }
        catch (error) {
            const msg = String(error?.message || error);
            // The Python-side engine reports a missing `kaggle` pip package with this
            // phrase; translate it into an actionable install hint.
            if (msg.toLowerCase().includes("kaggle package not installed")) {
                console.error("Kaggle support is optional and needs the official client: pip install kaggle");
            }
            else {
                console.error(`Kaggle discover failed: ${msg}`);
            }
            process.exit(1);
        }
        return;
    }
    // Default source: anonymous Hugging Face search (no credentials required).
    const hf = new HuggingFaceScraper();
    const results = await hf.scrape(limit, true, query);
    console.log(formatSearchResults(results));
}
|
|
1457
|
+
/**
 * `vespermcp download` CLI entry point.
 *
 * Usage: `vespermcp download kaggle user/dataset-name [--target-dir C:/path]`
 * Positional layout: [0] "download" subcommand, [1] source, [2] dataset id.
 * Kaggle downloads into an explicit target dir go through `kaggleSource.download`
 * and are registered in the metadata store; everything else is delegated to
 * `dataIngestor.ensureData`.
 *
 * @param {string[]} args - raw CLI argument list.
 */
async function runDownloadCli(args) {
    // Usage: vespermcp download kaggle user/dataset-name [--target-dir C:/path]
    const dirFlagAt = args.indexOf("--target-dir");
    const targetDir = dirFlagAt !== -1 && dirFlagAt + 1 < args.length ? args[dirFlagAt + 1] : undefined;
    // Collect positional tokens, dropping flags and the --target-dir value.
    const positionals = [];
    for (let i = 0; i < args.length; i++) {
        const token = args[i];
        if (token.startsWith("--"))
            continue;
        if (dirFlagAt !== -1 && i === dirFlagAt + 1)
            continue;
        positionals.push(token);
    }
    const source = (positionals[1] || "").toLowerCase();
    const datasetId = positionals[2] || "";
    if (!source || !datasetId) {
        console.error("Usage: vespermcp download kaggle <username/dataset-name> [--target-dir C:/path]");
        process.exit(1);
    }
    if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
        console.error("Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).");
        // Offer interactive credential setup only when a human is on stdin.
        if (process.stdin.isTTY) {
            const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
            const answer = await new Promise(resolve => rl.question("Configure Kaggle now? [y/N]: ", resolve));
            rl.close();
            if (answer.trim().toLowerCase() === "y") {
                await runConfigCli(["config", "kaggle"]);
            }
        }
        // Setup may have been declined or failed — re-verify before proceeding.
        if (!dataIngestor.hasKaggleCredentials())
            process.exit(1);
    }
    let localPath = "";
    try {
        if (source === "kaggle" && targetDir) {
            // Accept either a bare `user/name` id or a full kaggle.com URL.
            const marker = "kaggle.com/datasets/";
            const normalized = datasetId.includes(marker)
                ? datasetId.split(marker)[1].replace(/^\//, "")
                : datasetId;
            const dl = await kaggleSource.download(normalized, targetDir);
            localPath = dl.local_path;
            const size = fs.existsSync(localPath) ? fs.statSync(localPath).size : 0;
            metadataStore.registerDownload(normalized, localPath, "completed", size);
        }
        else {
            localPath = await dataIngestor.ensureData(datasetId, source, (progressMsg) => console.log(progressMsg));
        }
    }
    catch (error) {
        const msg = String(error?.message || error);
        // Missing `kaggle` pip package gets a targeted install hint.
        if (source === "kaggle" && msg.toLowerCase().includes("kaggle package not installed")) {
            console.error("Kaggle support is optional and needs the official client: pip install kaggle");
        }
        else {
            console.error(`Download failed: ${msg}`);
        }
        process.exit(1);
    }
    console.log(`✅ Download complete: ${localPath}`);
}
|
|
1514
|
+
/**
 * `vespermcp fuse` CLI entry point.
 *
 * Usage: `vespermcp fuse --sources <file1> <file2> [...] [--strategy concat|join]
 *         [--on col1,col2] [--how inner|left|outer] [--dedup] [--quality]
 *         [--leakage] [--format feather|parquet|csv|jsonl|arrow] [--output path]`
 *
 * Fixes:
 *  - `fusionEngine.fuse` was not wrapped in try/catch, so any fusion error
 *    surfaced as an unhandled rejection/stack dump instead of the friendly
 *    `console.error` + `exit(1)` used by the sibling CLI commands.
 *  - Removed the no-op ternary on the default output extension
 *    (`outputFormat === "arrow" ? "arrow" : outputFormat` always yields
 *    `outputFormat`).
 *
 * @param {string[]} args - raw CLI argument list.
 */
async function runFuseCli(args) {
    // Returns the token following a `--flag`, or undefined when absent/dangling.
    const getArgValue = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        return undefined;
    };
    // Collect every value after `name` up to (not including) the next flag.
    const collectListAfter = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx < 0)
            return [];
        const out = [];
        for (let i = idx + 1; i < args.length; i++) {
            if (args[i].startsWith("--"))
                break;
            out.push(args[i]);
        }
        return out;
    };
    const sources = collectListAfter("--sources");
    // Fusion needs at least two inputs.
    if (sources.length < 2) {
        console.error("Usage: vespermcp fuse --sources <file1> <file2> [more] --strategy concat|join [--on id] [--how inner|left|outer] [--dedup] [--quality] [--leakage] [--format feather|parquet|csv|jsonl|arrow]");
        process.exit(1);
    }
    const strategy = getArgValue("--strategy") || "concat";
    const onValue = getArgValue("--on");
    // `--on a,b,c` → ["a", "b", "c"] with blanks removed; undefined when absent.
    const joinOn = onValue ? onValue.split(",").map(s => s.trim()).filter(Boolean) : undefined;
    const how = getArgValue("--how") || "inner";
    const outputFormat = getArgValue("--format") || "feather";
    const compression = getArgValue("--compression");
    // Default output lands in the working directory, named by format + timestamp.
    const outputPath = getArgValue("--output") || path.join(process.cwd(), `fused_${Date.now()}.${outputFormat}`);
    const dedup = args.includes("--dedup");
    const runQualityAfter = args.includes("--quality");
    const leakageCheck = args.includes("--leakage");
    const preview = !args.includes("--no-preview");
    let result;
    try {
        result = await fusionEngine.fuse(sources, outputPath, {
            strategy,
            join_on: joinOn,
            how,
            dedup,
            run_quality_after: runQualityAfter,
            leakage_check: leakageCheck,
            output_format: outputFormat,
            compression,
            preview,
        });
    }
    catch (error) {
        // Mirror the error handling of the other CLI subcommands: friendly
        // message on stderr and a non-zero exit, not an unhandled rejection.
        const msg = String(error?.message || error);
        console.error(`Fusion failed: ${msg}`);
        process.exit(1);
    }
    const nullDelta = result.stats.null_delta;
    // Show an explicit sign for non-negative deltas, e.g. "+3%".
    const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
    console.log(`Fused ${result.stats.sources_count} sources → ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).`);
    console.log(`Null increase: ${nullText}`);
    console.log(`Output: ${result.output_path}`);
    if (result.preview_path)
        console.log(`Preview saved: ${result.preview_path}`);
    console.log("Next: run vespermcp split/export on the fused dataset");
}
|
|
781
1569
|
async function runSetupWizard(silent = false) {
|
|
782
1570
|
const configManager = new ConfigManager();
|
|
783
1571
|
if (!silent) {
|