@vespermcp/mcp-server 1.1.3 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/fusion/engine.js +69 -0
- package/build/index.js +900 -50
- package/build/ingestion/hf-downloader.js +12 -3
- package/build/ingestion/ingestor.js +33 -9
- package/build/ingestion/kaggle-downloader.js +2 -2
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/scraper.js +34 -10
- package/build/python/config.py +259 -0
- package/build/python/export_engine.py +148 -52
- package/build/python/fusion_engine.py +368 -0
- package/build/python/kaggle_engine.py +204 -0
- package/build/python/row_count.py +54 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/scripts/build-index.js +5 -5
- package/build/search/jit-orchestrator.js +72 -12
- package/build/tools/formatter.js +14 -14
- package/package.json +9 -3
- package/scripts/refresh-index.cjs +87 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/config.py +259 -0
- package/src/python/export_engine.py +148 -52
- package/src/python/fusion_engine.py +368 -0
- package/src/python/kaggle_engine.py +204 -0
- package/src/python/row_count.py +54 -0
- package/src/python/test_fusion_engine.py +89 -0
package/build/index.js
CHANGED
|
@@ -5,10 +5,13 @@ import { CallToolRequestSchema, ListToolsRequestSchema, ErrorCode, McpError, } f
|
|
|
5
5
|
import { fileURLToPath } from "url";
|
|
6
6
|
import path from "path";
|
|
7
7
|
import fs from "fs";
|
|
8
|
+
import { spawn } from "child_process";
|
|
8
9
|
import { MetadataStore } from "./metadata/store.js";
|
|
9
10
|
import { VectorStore } from "./search/vector-store.js";
|
|
10
11
|
import { Embedder } from "./search/embedder.js";
|
|
11
12
|
import { SearchEngine } from "./search/engine.js";
|
|
13
|
+
import { HuggingFaceScraper } from "./metadata/scraper.js";
|
|
14
|
+
import { KaggleSource } from "./metadata/kaggle-source.js";
|
|
12
15
|
import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
|
|
13
16
|
import { JobManager } from "./jobs/manager.js";
|
|
14
17
|
import { QualityAnalyzer } from "./quality/analyzer.js";
|
|
@@ -17,6 +20,7 @@ import { DataCleaner } from "./cleaning/cleaner.js";
|
|
|
17
20
|
import { PipelineExecutor } from "./cleaning/executor.js";
|
|
18
21
|
import { DataSplitter } from "./splitting/splitter.js";
|
|
19
22
|
import { DataExporter } from "./export/exporter.js";
|
|
23
|
+
import { DataFusionEngine } from "./fusion/engine.js";
|
|
20
24
|
import { DataIngestor } from "./ingestion/ingestor.js";
|
|
21
25
|
import { InstallService } from "./install/install-service.js";
|
|
22
26
|
import { CacheService, MockRedisProvider } from "./cache/service.js";
|
|
@@ -24,6 +28,8 @@ import { ImageAnalyzer } from "./quality/image-analyzer.js";
|
|
|
24
28
|
import { MediaAnalyzer } from "./quality/media-analyzer.js";
|
|
25
29
|
import { QualityOrchestrator } from "./quality/quality-orchestrator.js";
|
|
26
30
|
import { ConfigManager } from "./config/config-manager.js";
|
|
31
|
+
import { SecureKeysManager } from "./config/secure-keys.js";
|
|
32
|
+
import readline from "readline";
|
|
27
33
|
import os from "os";
|
|
28
34
|
// Determine absolute paths relative to the compiled script
|
|
29
35
|
const __filename = fileURLToPath(import.meta.url);
|
|
@@ -49,6 +55,109 @@ function logError(err, context) {
|
|
|
49
55
|
fs.appendFileSync(errorLogPath, msg);
|
|
50
56
|
console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
|
|
51
57
|
}
|
|
58
|
+
// Braille glyphs used as animation frames; runWithSpinner cycles through them in order.
const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
|
|
59
|
+
/**
 * Print the static Vesper launch banner.
 *
 * The banner is a single multi-line template literal written with
 * console.error, i.e. it goes to stderr rather than stdout.
 * NOTE(review): presumably stdout is reserved for the MCP transport - confirm.
 * The text is fixed; the "[ core ]", "[ quality ]", ... lines are not derived
 * from any runtime state.
 */
function printLaunchScreen() {
|
|
60
|
+
const screen = `
|
|
61
|
+
══════════════════════════════════════════════
|
|
62
|
+
|
|
63
|
+
██ ██ ███████ ███████ ██████ ███████ ██████
|
|
64
|
+
██ ██ ██ ██ ██ ██ ██ ██ ██
|
|
65
|
+
██ ██ █████ █████ ██████ █████ ██████
|
|
66
|
+
██ ██ ██ ██ ██ ██ ██ ██
|
|
67
|
+
████ ███████ ███████ ██ ███████ ██ ██
|
|
68
|
+
|
|
69
|
+
dataset intelligence layer
|
|
70
|
+
mcp-native • agent-first
|
|
71
|
+
|
|
72
|
+
══════════════════════════════════════════════
|
|
73
|
+
|
|
74
|
+
[ core ] initializing
|
|
75
|
+
[ splitting ] leakage-safe
|
|
76
|
+
[ quality ] multimodal scan
|
|
77
|
+
[ fusion ] guarded
|
|
78
|
+
[ synth ] generation ready
|
|
79
|
+
|
|
80
|
+
status: operational
|
|
81
|
+
`;
|
|
82
|
+
console.error(screen);
|
|
83
|
+
}
|
|
84
|
+
/**
 * Run an async task, showing a spinner on stderr if it is slow.
 *
 * When stderr is not a TTY the task is invoked directly with no output.
 * Otherwise the spinner starts only after the task has been running for
 * 1000 ms, and then redraws every 90 ms using SPINNER_FRAMES. If the spinner
 * was shown, a final "[ok] <label>" or "[error] <label>" line replaces it.
 *
 * @param {string} label - Text printed next to the spinner and in the final status line.
 * @param {() => Promise<any>} task - Work to run; called exactly once.
 * @returns {Promise<any>} Whatever `task` resolves to.
 * @throws Re-throws whatever `task` throws or rejects with, unchanged.
 */
async function runWithSpinner(label, task) {
    if (!process.stderr.isTTY) {
        return task();
    }
    let frameIndex = 0;
    let timer;
    let spinnerShown = false;
    // Assume failure until the task has resolved, so the finally block can
    // print the right tag without a separate catch branch.
    let outcome = "error";
    const delayedStart = setTimeout(() => {
        spinnerShown = true;
        timer = setInterval(() => {
            const frame = SPINNER_FRAMES[frameIndex % SPINNER_FRAMES.length];
            frameIndex += 1;
            process.stderr.write(`\r${frame} ${label}`);
        }, 90);
    }, 1000);
    try {
        const result = await task();
        outcome = "ok";
        return result;
    }
    finally {
        // Single cleanup path for success and failure: stop both timers,
        // then overwrite the spinner line only if it was ever drawn.
        clearTimeout(delayedStart);
        if (timer) {
            clearInterval(timer);
        }
        if (spinnerShown) {
            process.stderr.write(`\r[${outcome}] ${label} \n`);
        }
    }
}
|
|
117
|
+
/**
 * Guess how many rows the user asked for from free-text input.
 *
 * Two heuristics are applied to the lower-cased concatenation of `query`
 * and `requirements`:
 *  1. An explicit count followed by "sample(s)", "row(s)" or "record(s)",
 *     e.g. "10,000 rows", "1 000 000 samples", "500rows".
 *  2. Otherwise, the largest standalone 4-9 digit number anywhere in the text.
 *     Note that this fallback also picks up things like years ("2023").
 *
 * @param {string} [query] - Search query text.
 * @param {string} [requirements] - Additional free-text requirements.
 * @returns {number|undefined} The requested row count, or undefined if none was found.
 */
function extractRequestedRows(query, requirements) {
    const text = `${query || ""} ${requirements || ""}`.toLowerCase();
    // Either a properly grouped number (1-3 digits followed by ",ddd"/" ddd"
    // groups) or a plain digit run. The lookbehind stops a match from
    // starting in the middle of another number, so "2021 500 rows" yields
    // 500 rather than gluing the year onto the count.
    const explicit = text.match(/(?<!\d)(\d{1,3}(?:[,\s]\d{3})+|\d+)\s*(?:samples?|rows?|records?)/);
    if (explicit) {
        const n = Number(explicit[1].replace(/[\s,]/g, ""));
        if (Number.isSafeInteger(n) && n > 0) {
            return n;
        }
    }
    const candidates = [...text.matchAll(/\b\d{4,9}\b/g)]
        .map((m) => Number(m[0]))
        .filter((n) => Number.isFinite(n) && n > 0);
    return candidates.length > 0 ? Math.max(...candidates) : undefined;
}
|
|
132
|
+
/**
 * Run a Python script and parse its stdout as JSON.
 *
 * The interpreter is "py" on Windows and "python" elsewhere.
 *
 * @param {string} scriptPath - Path of the Python script to execute.
 * @param {string[]} [args] - Command-line arguments passed after the script path.
 * @returns {Promise<any>} Resolves with the value parsed from the script's stdout.
 *   Rejects with an Error when the interpreter cannot be started, when the
 *   process exits with a non-zero code or is killed by a signal, or when
 *   stdout is not valid JSON.
 */
function runPythonJson(scriptPath, args = []) {
    const pyCmd = process.platform === "win32" ? "py" : "python";
    return new Promise((resolve, reject) => {
        const proc = spawn(pyCmd, [scriptPath, ...args]);
        let stdout = "";
        let stderr = "";
        // Decode at the stream level so a multi-byte UTF-8 character split
        // across two chunks is not corrupted by per-chunk toString().
        proc.stdout.setEncoding("utf8");
        proc.stderr.setEncoding("utf8");
        proc.stdout.on("data", (chunk) => {
            stdout += chunk;
        });
        proc.stderr.on("data", (chunk) => {
            stderr += chunk;
        });
        // Without this listener a missing interpreter (ENOENT) raises an
        // unhandled 'error' event and takes the whole process down.
        proc.on("error", (err) => {
            reject(new Error(`Failed to start ${pyCmd}: ${err.message}`, { cause: err }));
        });
        proc.on("close", (code, signal) => {
            if (code !== 0) {
                // code is null when the child was terminated by a signal.
                const reason = code ?? `signal ${signal}`;
                reject(new Error(stderr || stdout || `Python exited with ${reason}`));
                return;
            }
            try {
                resolve(JSON.parse(stdout));
            }
            catch (err) {
                reject(new Error(`Invalid JSON from python helper: ${stdout}`, { cause: err }));
            }
        });
    });
}
|
|
154
|
+
/**
 * Count the rows of a data file by delegating to the row_count.py helper
 * located under `<dataRoot>/python`.
 *
 * The helper's JSON result is read through its `ok`, `error` and `rows`
 * fields; a missing `rows` is treated as 0.
 *
 * @param {string} filePath - Path of the file whose rows should be counted.
 * @returns {Promise<number>} A finite, non-negative row count.
 * @throws {Error} When the helper fails, reports `ok` as falsy, or returns a
 *   `rows` value that is not a finite non-negative number.
 */
async function countRows(filePath) {
    const scriptPath = path.join(dataRoot, "python", "row_count.py");
    const result = await runPythonJson(scriptPath, [filePath]);
    // `result` may legitimately parse to null; treat that like a failure
    // instead of crashing on the property access.
    if (!result || !result.ok) {
        throw new Error(result?.error || "Failed to count rows");
    }
    const rows = Number(result.rows || 0);
    // A NaN here would make every later `<` / `>=` comparison false and
    // silently skip the sample-count validation, so reject it explicitly.
    if (!Number.isFinite(rows) || rows < 0) {
        throw new Error(`Invalid row count from row_count.py: ${JSON.stringify(result.rows)}`);
    }
    return rows;
}
|
|
52
161
|
/**
|
|
53
162
|
* Sync Python scripts from the application package to the stable data directory (~/.vesper/python)
|
|
54
163
|
*/
|
|
@@ -105,6 +214,21 @@ const dataCleaner = new DataCleaner(__dirname);
|
|
|
105
214
|
const pipelineExecutor = new PipelineExecutor(dataRoot, __dirname);
|
|
106
215
|
const dataSplitter = new DataSplitter(__dirname);
|
|
107
216
|
const dataExporter = new DataExporter(__dirname);
|
|
217
|
+
// Module-level service instances added alongside the existing ones; each is
// constructed with __dirname, the same argument the neighbouring services receive.
const fusionEngine = new DataFusionEngine(__dirname);
|
|
218
|
+
const kaggleSource = new KaggleSource(__dirname);
|
|
219
|
+
const secureKeys = new SecureKeysManager(__dirname);
|
|
220
|
+
/**
 * Copy stored credentials from the secure key store into process.env.
 *
 * Values already present in the environment always win: HF_TOKEN is filled
 * only when neither HF_TOKEN nor HUGGINGFACE_TOKEN is set, and the two Kaggle
 * variables are each filled only when unset. Stored values are coerced to
 * strings before assignment.
 */
function hydrateExternalKeys() {
    const stored = secureKeys.getAll();
    const env = process.env;
    const hfAlreadySet = Boolean(env.HF_TOKEN || env.HUGGINGFACE_TOKEN);
    if (!hfAlreadySet && stored.hf_token) {
        env.HF_TOKEN = String(stored.hf_token);
    }
    // The Kaggle variables follow the same "fill only if unset" rule.
    const kaggleMappings = [
        ["KAGGLE_USERNAME", "kaggle_username"],
        ["KAGGLE_KEY", "kaggle_key"],
    ];
    for (const [envName, storeName] of kaggleMappings) {
        if (!env[envName] && stored[storeName]) {
            env[envName] = String(stored[storeName]);
        }
    }
}
|
|
108
232
|
// CRITICAL FIX: Pass __dirname (build directory) to analyzers
|
|
109
233
|
// Python scripts are in build/python/, so analyzers should look relative to build/
|
|
110
234
|
// NOT relative to project root (appRoot)
|
|
@@ -117,11 +241,11 @@ const qualityOrchestrator = new QualityOrchestrator(__dirname);
|
|
|
117
241
|
// Subscribe to job updates for real-time streaming to the UI
|
|
118
242
|
jobManager.on("jobUpdated", (job) => {
|
|
119
243
|
const level = job.status === "failed" ? "error" : "info";
|
|
120
|
-
const
|
|
244
|
+
const statusTag = job.status === "completed" ? "done" : (job.status === "failed" ? "failed" : "running");
|
|
121
245
|
const progress = job.progress > 0 ? `[${job.progress}%]` : "";
|
|
122
246
|
server.sendLoggingMessage({
|
|
123
247
|
level,
|
|
124
|
-
data:
|
|
248
|
+
data: `[${statusTag}] [Job ${job.id.substring(0, 8)}] ${progress} ${job.status_text}`
|
|
125
249
|
});
|
|
126
250
|
});
|
|
127
251
|
// IMPORTANT: Execute jobs when the manager emits them
|
|
@@ -136,7 +260,7 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
136
260
|
console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
|
|
137
261
|
const metadata = job.metadata ? JSON.parse(job.metadata) : {};
|
|
138
262
|
switch (job.type) {
|
|
139
|
-
case "prepare": return await handlePrepareJob(job.id, metadata.query);
|
|
263
|
+
case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
|
|
140
264
|
case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
|
|
141
265
|
default: throw new Error(`Unhandled job type: ${job.type}`);
|
|
142
266
|
}
|
|
@@ -154,8 +278,9 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
154
278
|
/**
|
|
155
279
|
* Logic for preparing a dataset (Search + Ingest + Process)
|
|
156
280
|
*/
|
|
157
|
-
async function handlePrepareJob(jobId, query) {
|
|
281
|
+
async function handlePrepareJob(jobId, query, requirements) {
|
|
158
282
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
283
|
+
const requestedRows = extractRequestedRows(query, requirements);
|
|
159
284
|
update({ progress: 10, status_text: "Searching for best dataset matching query..." });
|
|
160
285
|
const results = await searchEngine.search(query, { limit: 1 });
|
|
161
286
|
if (results.length === 0) {
|
|
@@ -176,9 +301,59 @@ async function handlePrepareJob(jobId, query) {
|
|
|
176
301
|
}
|
|
177
302
|
update({ progress: 30, status_text: `Starting download from ${source}...` });
|
|
178
303
|
// ensureData handles download and returns path to the raw file
|
|
179
|
-
|
|
304
|
+
let rawFilePath = await dataIngestor.ensureData(topDataset.id, source, (msg, prog) => {
|
|
180
305
|
update({ status_text: msg, progress: 30 + (prog ? Math.floor(prog * 0.4) : 0) });
|
|
181
306
|
});
|
|
307
|
+
if (requestedRows && requestedRows > 0) {
|
|
308
|
+
update({ progress: 62, status_text: `Validating requested sample count (${requestedRows.toLocaleString()})...` });
|
|
309
|
+
let currentRows = await countRows(rawFilePath);
|
|
310
|
+
if (currentRows < requestedRows) {
|
|
311
|
+
update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
|
|
312
|
+
const additional = await searchEngine.search(query, { limit: 8 });
|
|
313
|
+
const sourceFiles = [rawFilePath];
|
|
314
|
+
let totalRows = currentRows;
|
|
315
|
+
for (const ds of additional) {
|
|
316
|
+
if (ds.id === topDataset.id)
|
|
317
|
+
continue;
|
|
318
|
+
try {
|
|
319
|
+
const dsSource = ds.source;
|
|
320
|
+
if (dsSource === "kaggle" && !dataIngestor.hasKaggleCredentials())
|
|
321
|
+
continue;
|
|
322
|
+
const p = await dataIngestor.ensureData(ds.id, dsSource, () => undefined);
|
|
323
|
+
const r = await countRows(p);
|
|
324
|
+
if (r <= 0)
|
|
325
|
+
continue;
|
|
326
|
+
sourceFiles.push(p);
|
|
327
|
+
totalRows += r;
|
|
328
|
+
if (totalRows >= requestedRows)
|
|
329
|
+
break;
|
|
330
|
+
}
|
|
331
|
+
catch {
|
|
332
|
+
// ignore candidate failures and continue trying
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
if (sourceFiles.length > 1) {
|
|
336
|
+
update({ progress: 67, status_text: `Fusing ${sourceFiles.length} datasets to meet row target...` });
|
|
337
|
+
const fusedPath = path.join(dataRoot, "fusion", `prepare_fused_${Date.now()}.feather`);
|
|
338
|
+
const fusionResult = await fusionEngine.fuse(sourceFiles, fusedPath, {
|
|
339
|
+
strategy: "concat",
|
|
340
|
+
dedup: true,
|
|
341
|
+
run_quality_after: false,
|
|
342
|
+
leakage_check: false,
|
|
343
|
+
output_format: "feather",
|
|
344
|
+
compression: "lz4",
|
|
345
|
+
preview: true,
|
|
346
|
+
});
|
|
347
|
+
rawFilePath = fusionResult.output_path;
|
|
348
|
+
currentRows = await countRows(rawFilePath);
|
|
349
|
+
}
|
|
350
|
+
if (currentRows < requestedRows) {
|
|
351
|
+
throw new Error(`Requested ${requestedRows.toLocaleString()} samples, but only ${currentRows.toLocaleString()} available across current matches. ` +
|
|
352
|
+
`Try broader query or enable additional sources.`);
|
|
353
|
+
}
|
|
354
|
+
update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
|
|
355
|
+
}
|
|
356
|
+
}
|
|
182
357
|
update({ progress: 70, status_text: "Analyzing dataset quality..." });
|
|
183
358
|
const report = await qualityAnalyzer.analyze(rawFilePath);
|
|
184
359
|
// Update local metadata with quality info
|
|
@@ -234,10 +409,84 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
234
409
|
type: "string",
|
|
235
410
|
description: "The search query. Use -term to exclude keywords.",
|
|
236
411
|
},
|
|
412
|
+
enable_jit: {
|
|
413
|
+
type: "boolean",
|
|
414
|
+
description: "Enable live JIT search when local library results are insufficient (default: false).",
|
|
415
|
+
},
|
|
416
|
+
},
|
|
417
|
+
required: ["query"],
|
|
418
|
+
},
|
|
419
|
+
},
|
|
420
|
+
{
|
|
421
|
+
name: "discover_datasets",
|
|
422
|
+
description: "Discover datasets from a specific source. Kaggle is optional and requires user-provided API key.",
|
|
423
|
+
inputSchema: {
|
|
424
|
+
type: "object",
|
|
425
|
+
properties: {
|
|
426
|
+
query: {
|
|
427
|
+
type: "string",
|
|
428
|
+
description: "Search query, e.g. 'credit risk'.",
|
|
429
|
+
},
|
|
430
|
+
source: {
|
|
431
|
+
type: "string",
|
|
432
|
+
enum: ["huggingface", "kaggle"],
|
|
433
|
+
description: "Data source to discover from.",
|
|
434
|
+
},
|
|
435
|
+
limit: {
|
|
436
|
+
type: "number",
|
|
437
|
+
description: "Max results to return (default: 10).",
|
|
438
|
+
},
|
|
237
439
|
},
|
|
238
440
|
required: ["query"],
|
|
239
441
|
},
|
|
240
442
|
},
|
|
443
|
+
{
|
|
444
|
+
name: "download_dataset",
|
|
445
|
+
description: "Download a dataset by source and ID/slug into local Vesper storage. Kaggle requires optional API key.",
|
|
446
|
+
inputSchema: {
|
|
447
|
+
type: "object",
|
|
448
|
+
properties: {
|
|
449
|
+
source: {
|
|
450
|
+
type: "string",
|
|
451
|
+
enum: ["huggingface", "kaggle"],
|
|
452
|
+
description: "Dataset source.",
|
|
453
|
+
},
|
|
454
|
+
dataset_id: {
|
|
455
|
+
type: "string",
|
|
456
|
+
description: "Dataset ID/slug (e.g. user/dataset for Kaggle or HF).",
|
|
457
|
+
},
|
|
458
|
+
target_dir: {
|
|
459
|
+
type: "string",
|
|
460
|
+
description: "Optional target directory for downloaded files.",
|
|
461
|
+
}
|
|
462
|
+
},
|
|
463
|
+
required: ["source", "dataset_id"],
|
|
464
|
+
},
|
|
465
|
+
},
|
|
466
|
+
{
|
|
467
|
+
name: "configure_kaggle",
|
|
468
|
+
description: "Optionally store Kaggle API credentials for Kaggle discover/download. Core Vesper works without this.",
|
|
469
|
+
inputSchema: {
|
|
470
|
+
type: "object",
|
|
471
|
+
properties: {
|
|
472
|
+
username: { type: "string", description: "Kaggle username" },
|
|
473
|
+
key: { type: "string", description: "Kaggle API key" }
|
|
474
|
+
},
|
|
475
|
+
required: ["username", "key"],
|
|
476
|
+
},
|
|
477
|
+
},
|
|
478
|
+
{
|
|
479
|
+
name: "configure_keys",
|
|
480
|
+
description: "One-time optional key setup for external sources (Kaggle + gated HF). Core tools do not require keys.",
|
|
481
|
+
inputSchema: {
|
|
482
|
+
type: "object",
|
|
483
|
+
properties: {
|
|
484
|
+
hf_token: { type: "string", description: "Optional Hugging Face token for gated/private datasets" },
|
|
485
|
+
kaggle_username: { type: "string", description: "Optional Kaggle username" },
|
|
486
|
+
kaggle_key: { type: "string", description: "Optional Kaggle API key" }
|
|
487
|
+
},
|
|
488
|
+
},
|
|
489
|
+
},
|
|
241
490
|
{
|
|
242
491
|
name: "get_dataset_info",
|
|
243
492
|
description: "Get detailed metadata for a specific dataset by its ID. Returns comprehensive information including license, safety flags, and data characteristics.",
|
|
@@ -346,7 +595,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
346
595
|
},
|
|
347
596
|
{
|
|
348
597
|
name: "export_dataset",
|
|
349
|
-
description: "Export
|
|
598
|
+
description: "Export a dataset to a local directory. Use format='feather' (default) for 5-10× faster writes than CSV. Add fast=true to skip quality/cleaning steps.",
|
|
350
599
|
inputSchema: {
|
|
351
600
|
type: "object",
|
|
352
601
|
properties: {
|
|
@@ -360,13 +609,93 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
360
609
|
},
|
|
361
610
|
format: {
|
|
362
611
|
type: "string",
|
|
363
|
-
enum: ["
|
|
364
|
-
description: "
|
|
612
|
+
enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
|
|
613
|
+
description: "Output format. feather (fastest), parquet (best compression), csv (human-readable). Default: feather.",
|
|
614
|
+
},
|
|
615
|
+
compression: {
|
|
616
|
+
type: "string",
|
|
617
|
+
enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"],
|
|
618
|
+
description: "Compression algorithm. Default: lz4 for feather, snappy for parquet, none for csv.",
|
|
619
|
+
},
|
|
620
|
+
fast: {
|
|
621
|
+
type: "boolean",
|
|
622
|
+
description: "Skip quality analysis and cleaning – raw export only. Much faster. Default: false.",
|
|
623
|
+
},
|
|
624
|
+
preview: {
|
|
625
|
+
type: "boolean",
|
|
626
|
+
description: "Generate a small 500-row CSV preview alongside binary exports. Default: false.",
|
|
627
|
+
},
|
|
628
|
+
sample_rows: {
|
|
629
|
+
type: "number",
|
|
630
|
+
description: "Export only this many random rows (faster for huge datasets).",
|
|
631
|
+
},
|
|
632
|
+
columns: {
|
|
633
|
+
type: "array",
|
|
634
|
+
items: { type: "string" },
|
|
635
|
+
description: "Export only these columns (faster for wide datasets).",
|
|
365
636
|
},
|
|
366
637
|
},
|
|
367
638
|
required: ["dataset_id"],
|
|
368
639
|
},
|
|
369
640
|
},
|
|
641
|
+
{
|
|
642
|
+
name: "fuse_datasets",
|
|
643
|
+
description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
|
|
644
|
+
inputSchema: {
|
|
645
|
+
type: "object",
|
|
646
|
+
properties: {
|
|
647
|
+
sources: {
|
|
648
|
+
type: "array",
|
|
649
|
+
items: { type: "string" },
|
|
650
|
+
description: "List of dataset IDs and/or local file paths to fuse.",
|
|
651
|
+
},
|
|
652
|
+
strategy: {
|
|
653
|
+
type: "string",
|
|
654
|
+
enum: ["concat", "join"],
|
|
655
|
+
description: "Fusion strategy. concat appends rows; join merges on key(s).",
|
|
656
|
+
},
|
|
657
|
+
join_on: {
|
|
658
|
+
oneOf: [
|
|
659
|
+
{ type: "string" },
|
|
660
|
+
{ type: "array", items: { type: "string" } }
|
|
661
|
+
],
|
|
662
|
+
description: "Join key(s). Required when strategy='join'.",
|
|
663
|
+
},
|
|
664
|
+
how: {
|
|
665
|
+
type: "string",
|
|
666
|
+
enum: ["inner", "left", "outer"],
|
|
667
|
+
description: "Join mode (only for strategy='join').",
|
|
668
|
+
},
|
|
669
|
+
dedup: {
|
|
670
|
+
type: "boolean",
|
|
671
|
+
description: "Drop exact duplicate rows after fusion.",
|
|
672
|
+
},
|
|
673
|
+
run_quality_after: {
|
|
674
|
+
type: "boolean",
|
|
675
|
+
description: "Run quality analysis on the fused output.",
|
|
676
|
+
},
|
|
677
|
+
leakage_check: {
|
|
678
|
+
type: "boolean",
|
|
679
|
+
description: "Run leakage/overlap checks across fused sources.",
|
|
680
|
+
},
|
|
681
|
+
output_format: {
|
|
682
|
+
type: "string",
|
|
683
|
+
enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
|
|
684
|
+
description: "Output format (default: feather).",
|
|
685
|
+
},
|
|
686
|
+
compression: {
|
|
687
|
+
type: "string",
|
|
688
|
+
enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"],
|
|
689
|
+
description: "Compression algorithm for binary outputs.",
|
|
690
|
+
},
|
|
691
|
+
preview: {
|
|
692
|
+
type: "boolean",
|
|
693
|
+
description: "Generate a small preview CSV of fused output.",
|
|
694
|
+
},
|
|
695
|
+
},
|
|
696
|
+
required: ["sources"],
|
|
697
|
+
},
|
|
698
|
+
},
|
|
370
699
|
{
|
|
371
700
|
name: "analyze_image_quality",
|
|
372
701
|
description: "Analyze image quality (resolution, blur, corruption) for a folder or single image.",
|
|
@@ -423,10 +752,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
423
752
|
const query = String(request.params.arguments?.query);
|
|
424
753
|
const limit = 5;
|
|
425
754
|
const safeOnly = true; // Enable safe filter by default
|
|
755
|
+
const enableJIT = request.params.arguments?.enable_jit === true;
|
|
426
756
|
if (!query) {
|
|
427
757
|
throw new McpError(ErrorCode.InvalidParams, "Query is required");
|
|
428
758
|
}
|
|
429
|
-
const results = await searchEngine.search(query, { limit, safeOnly });
|
|
759
|
+
const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
|
|
430
760
|
const formattedOutput = formatSearchResults(results);
|
|
431
761
|
return {
|
|
432
762
|
content: [
|
|
@@ -437,6 +767,123 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
437
767
|
],
|
|
438
768
|
};
|
|
439
769
|
}
|
|
770
|
+
case "discover_datasets": {
|
|
771
|
+
hydrateExternalKeys();
|
|
772
|
+
const query = String(request.params.arguments?.query || "").trim();
|
|
773
|
+
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
774
|
+
const limit = Number(request.params.arguments?.limit || 10);
|
|
775
|
+
if (!query) {
|
|
776
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required");
|
|
777
|
+
}
|
|
778
|
+
try {
|
|
779
|
+
let results = [];
|
|
780
|
+
if (source === "kaggle") {
|
|
781
|
+
if (!dataIngestor.hasKaggleCredentials()) {
|
|
782
|
+
return {
|
|
783
|
+
content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
|
|
784
|
+
isError: true,
|
|
785
|
+
};
|
|
786
|
+
}
|
|
787
|
+
results = await kaggleSource.discover(query, limit);
|
|
788
|
+
}
|
|
789
|
+
else {
|
|
790
|
+
const hf = new HuggingFaceScraper();
|
|
791
|
+
results = await hf.scrape(Math.max(1, limit), true, query);
|
|
792
|
+
}
|
|
793
|
+
const formattedOutput = formatSearchResults(results.slice(0, limit));
|
|
794
|
+
return {
|
|
795
|
+
content: [{ type: "text", text: formattedOutput }]
|
|
796
|
+
};
|
|
797
|
+
}
|
|
798
|
+
catch (error) {
|
|
799
|
+
return {
|
|
800
|
+
content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
|
|
801
|
+
isError: true,
|
|
802
|
+
};
|
|
803
|
+
}
|
|
804
|
+
}
|
|
805
|
+
case "download_dataset": {
|
|
806
|
+
hydrateExternalKeys();
|
|
807
|
+
const source = String(request.params.arguments?.source || "").toLowerCase();
|
|
808
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
809
|
+
if (!source || !datasetId) {
|
|
810
|
+
throw new McpError(ErrorCode.InvalidParams, "source and dataset_id are required");
|
|
811
|
+
}
|
|
812
|
+
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
813
|
+
return {
|
|
814
|
+
content: [{ type: "text", text: `Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).` }],
|
|
815
|
+
isError: true,
|
|
816
|
+
};
|
|
817
|
+
}
|
|
818
|
+
try {
|
|
819
|
+
const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
|
|
820
|
+
return {
|
|
821
|
+
content: [{ type: "text", text: `Download complete: ${localPath}` }]
|
|
822
|
+
};
|
|
823
|
+
}
|
|
824
|
+
catch (error) {
|
|
825
|
+
return {
|
|
826
|
+
content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
|
|
827
|
+
isError: true,
|
|
828
|
+
};
|
|
829
|
+
}
|
|
830
|
+
}
|
|
831
|
+
case "configure_kaggle": {
|
|
832
|
+
const username = String(request.params.arguments?.username || "").trim();
|
|
833
|
+
const key = String(request.params.arguments?.key || "").trim();
|
|
834
|
+
if (!username || !key) {
|
|
835
|
+
throw new McpError(ErrorCode.InvalidParams, "username and key are required");
|
|
836
|
+
}
|
|
837
|
+
const r1 = secureKeys.set("kaggle_username", username);
|
|
838
|
+
const r2 = secureKeys.set("kaggle_key", key);
|
|
839
|
+
process.env.KAGGLE_USERNAME = username;
|
|
840
|
+
process.env.KAGGLE_KEY = key;
|
|
841
|
+
return {
|
|
842
|
+
content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
|
|
843
|
+
};
|
|
844
|
+
}
|
|
845
|
+
case "configure_keys": {
|
|
846
|
+
const hfToken = String(request.params.arguments?.hf_token || "").trim();
|
|
847
|
+
const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
|
|
848
|
+
const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
|
|
849
|
+
const saved = [];
|
|
850
|
+
const methods = [];
|
|
851
|
+
if (hfToken) {
|
|
852
|
+
const r = secureKeys.set("hf_token", hfToken);
|
|
853
|
+
if (r.ok) {
|
|
854
|
+
process.env.HF_TOKEN = hfToken;
|
|
855
|
+
saved.push("HF token");
|
|
856
|
+
if (r.method)
|
|
857
|
+
methods.push(r.method);
|
|
858
|
+
}
|
|
859
|
+
}
|
|
860
|
+
if (kaggleUsername) {
|
|
861
|
+
const r = secureKeys.set("kaggle_username", kaggleUsername);
|
|
862
|
+
if (r.ok) {
|
|
863
|
+
process.env.KAGGLE_USERNAME = kaggleUsername;
|
|
864
|
+
saved.push("Kaggle username");
|
|
865
|
+
if (r.method)
|
|
866
|
+
methods.push(r.method);
|
|
867
|
+
}
|
|
868
|
+
}
|
|
869
|
+
if (kaggleKey) {
|
|
870
|
+
const r = secureKeys.set("kaggle_key", kaggleKey);
|
|
871
|
+
if (r.ok) {
|
|
872
|
+
process.env.KAGGLE_KEY = kaggleKey;
|
|
873
|
+
saved.push("Kaggle key");
|
|
874
|
+
if (r.method)
|
|
875
|
+
methods.push(r.method);
|
|
876
|
+
}
|
|
877
|
+
}
|
|
878
|
+
if (saved.length === 0) {
|
|
879
|
+
return {
|
|
880
|
+
content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
|
|
881
|
+
};
|
|
882
|
+
}
|
|
883
|
+
return {
|
|
884
|
+
content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
|
|
885
|
+
};
|
|
886
|
+
}
|
|
440
887
|
case "get_dataset_info": {
|
|
441
888
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
442
889
|
if (!datasetId) {
|
|
@@ -518,14 +965,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
518
965
|
confidence: targetResult.confidence
|
|
519
966
|
} : undefined;
|
|
520
967
|
const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
|
|
521
|
-
let explanation = `###
|
|
968
|
+
let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
|
|
522
969
|
if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
|
|
523
|
-
explanation +=
|
|
970
|
+
explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
|
|
524
971
|
explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
|
|
525
972
|
}
|
|
526
973
|
explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
|
|
527
974
|
if (plan.operations.length === 0) {
|
|
528
|
-
explanation += "
|
|
975
|
+
explanation += "No cleaning operations required.";
|
|
529
976
|
}
|
|
530
977
|
else {
|
|
531
978
|
plan.operations.forEach((op, i) => {
|
|
@@ -546,7 +993,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
546
993
|
}
|
|
547
994
|
case "prepare_dataset": {
|
|
548
995
|
const query = String(request.params.arguments?.query);
|
|
549
|
-
const
|
|
996
|
+
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
997
|
+
const job = jobManager.createJob("prepare", 0, { query, requirements });
|
|
550
998
|
return {
|
|
551
999
|
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
|
|
552
1000
|
};
|
|
@@ -577,7 +1025,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
577
1025
|
case "export_dataset": {
|
|
578
1026
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
579
1027
|
const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
|
|
580
|
-
const requestedFormat = request.params.arguments?.format || "
|
|
1028
|
+
const requestedFormat = String(request.params.arguments?.format || "feather");
|
|
1029
|
+
const fastMode = request.params.arguments?.fast === true;
|
|
1030
|
+
const preview = request.params.arguments?.preview === true;
|
|
1031
|
+
const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
|
|
1032
|
+
const columns = request.params.arguments?.columns;
|
|
1033
|
+
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
581
1034
|
const dataset = metadataStore.getDataset(datasetId);
|
|
582
1035
|
if (!dataset) {
|
|
583
1036
|
throw new McpError(ErrorCode.InvalidParams, `Dataset not found: ${datasetId}`);
|
|
@@ -591,30 +1044,153 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
591
1044
|
};
|
|
592
1045
|
}
|
|
593
1046
|
let sourcePath = downloadStatus.local_path;
|
|
594
|
-
//
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
1047
|
+
// If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
|
|
1048
|
+
if (!fastMode) {
|
|
1049
|
+
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
1050
|
+
const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "csv";
|
|
1051
|
+
if (currentExt !== pipelineFmt) {
|
|
1052
|
+
console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
|
|
1053
|
+
try {
|
|
1054
|
+
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
1055
|
+
if (pipelineResult.final_output_path) {
|
|
1056
|
+
sourcePath = pipelineResult.final_output_path;
|
|
1057
|
+
}
|
|
1058
|
+
}
|
|
1059
|
+
catch (err) {
|
|
1060
|
+
console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
|
|
1061
|
+
}
|
|
607
1062
|
}
|
|
608
1063
|
}
|
|
1064
|
+
else {
|
|
1065
|
+
console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
|
|
1066
|
+
}
|
|
1067
|
+
// Build export options
|
|
1068
|
+
const exportOpts = {};
|
|
1069
|
+
if (compression)
|
|
1070
|
+
exportOpts.compression = compression;
|
|
1071
|
+
if (preview)
|
|
1072
|
+
exportOpts.preview = true;
|
|
1073
|
+
if (sampleRows)
|
|
1074
|
+
exportOpts.sample_rows = sampleRows;
|
|
1075
|
+
if (columns)
|
|
1076
|
+
exportOpts.columns = columns;
|
|
609
1077
|
try {
|
|
610
|
-
|
|
1078
|
+
// Determine output file name
|
|
1079
|
+
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
1080
|
+
const ext = extMap[requestedFormat] || ".feather";
|
|
1081
|
+
const safeName = datasetId.replace(/\//g, "_");
|
|
1082
|
+
const outDir = targetDir || path.join(dataRoot, "exports");
|
|
1083
|
+
if (!fs.existsSync(outDir))
|
|
1084
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
1085
|
+
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
1086
|
+
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
1087
|
+
// Build rich response
|
|
1088
|
+
let msg = `**Export complete**\n`;
|
|
1089
|
+
msg += `- **File**: ${result.output_path}\n`;
|
|
1090
|
+
msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
|
|
1091
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
1092
|
+
if (result.file_size_mb !== undefined)
|
|
1093
|
+
msg += `- **Size**: ${result.file_size_mb} MB\n`;
|
|
1094
|
+
if (result.elapsed_seconds !== undefined)
|
|
1095
|
+
msg += `- **Time**: ${result.elapsed_seconds}s\n`;
|
|
1096
|
+
if (result.preview_path)
|
|
1097
|
+
msg += `- **Preview**: ${result.preview_path}\n`;
|
|
1098
|
+
msg += `\n`;
|
|
1099
|
+
if (requestedFormat === "feather") {
|
|
1100
|
+
msg += `**Inspect with:**\n`;
|
|
1101
|
+
msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
|
|
1102
|
+
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
1103
|
+
}
|
|
1104
|
+
else if (requestedFormat === "parquet") {
|
|
1105
|
+
msg += `**Inspect with:**\n`;
|
|
1106
|
+
msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
|
|
1107
|
+
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
1108
|
+
}
|
|
1109
|
+
return { content: [{ type: "text", text: msg }] };
|
|
1110
|
+
}
|
|
1111
|
+
catch (error) {
|
|
1112
|
+
return {
|
|
1113
|
+
content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
|
|
1114
|
+
isError: true
|
|
1115
|
+
};
|
|
1116
|
+
}
|
|
1117
|
+
}
|
|
1118
|
+
case "fuse_datasets": {
|
|
1119
|
+
const rawSources = request.params.arguments?.sources;
|
|
1120
|
+
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
1121
|
+
throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
|
|
1122
|
+
}
|
|
1123
|
+
const strategy = request.params.arguments?.strategy || "concat";
|
|
1124
|
+
const joinOn = request.params.arguments?.join_on;
|
|
1125
|
+
const how = request.params.arguments?.how || "inner";
|
|
1126
|
+
const dedup = request.params.arguments?.dedup !== false;
|
|
1127
|
+
const runQualityAfter = request.params.arguments?.run_quality_after !== false;
|
|
1128
|
+
const leakageCheck = request.params.arguments?.leakage_check !== false;
|
|
1129
|
+
const outputFormat = request.params.arguments?.output_format || "feather";
|
|
1130
|
+
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
1131
|
+
const preview = request.params.arguments?.preview !== false;
|
|
1132
|
+
const resolvedPaths = [];
|
|
1133
|
+
const unresolved = [];
|
|
1134
|
+
for (const src of rawSources) {
|
|
1135
|
+
if (fs.existsSync(src)) {
|
|
1136
|
+
resolvedPaths.push(src);
|
|
1137
|
+
continue;
|
|
1138
|
+
}
|
|
1139
|
+
const status = metadataStore.getDownloadStatus(src);
|
|
1140
|
+
if (status?.local_path && fs.existsSync(status.local_path)) {
|
|
1141
|
+
resolvedPaths.push(status.local_path);
|
|
1142
|
+
continue;
|
|
1143
|
+
}
|
|
1144
|
+
unresolved.push(src);
|
|
1145
|
+
}
|
|
1146
|
+
if (unresolved.length > 0) {
|
|
611
1147
|
return {
|
|
612
|
-
content: [{
|
|
1148
|
+
content: [{
|
|
1149
|
+
type: "text",
|
|
1150
|
+
text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
|
|
1151
|
+
}],
|
|
1152
|
+
isError: true
|
|
613
1153
|
};
|
|
614
1154
|
}
|
|
1155
|
+
try {
|
|
1156
|
+
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
|
|
1157
|
+
const ext = extMap[outputFormat] || ".feather";
|
|
1158
|
+
const outDir = path.join(dataRoot, "fusion");
|
|
1159
|
+
if (!fs.existsSync(outDir))
|
|
1160
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
1161
|
+
const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
|
|
1162
|
+
const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
|
|
1163
|
+
strategy,
|
|
1164
|
+
join_on: joinOn,
|
|
1165
|
+
how,
|
|
1166
|
+
dedup,
|
|
1167
|
+
run_quality_after: runQualityAfter,
|
|
1168
|
+
leakage_check: leakageCheck,
|
|
1169
|
+
output_format: outputFormat,
|
|
1170
|
+
compression: compression,
|
|
1171
|
+
preview,
|
|
1172
|
+
});
|
|
1173
|
+
const nullDelta = result.stats.null_delta;
|
|
1174
|
+
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
1175
|
+
let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
|
|
1176
|
+
msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
|
|
1177
|
+
msg += `- Null change: ${nullText}\n`;
|
|
1178
|
+
msg += `- Output: ${result.output_path}\n`;
|
|
1179
|
+
if (result.preview_path)
|
|
1180
|
+
msg += `- Preview: ${result.preview_path}\n`;
|
|
1181
|
+
if (result.leakage_report) {
|
|
1182
|
+
msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
|
|
1183
|
+
if (result.leakage_report.leakage_count) {
|
|
1184
|
+
msg += ` (${result.leakage_report.leakage_count})`;
|
|
1185
|
+
}
|
|
1186
|
+
msg += "\n";
|
|
1187
|
+
}
|
|
1188
|
+
msg += `\nNext: run split_dataset/export_dataset on fused output.`;
|
|
1189
|
+
return { content: [{ type: "text", text: msg }] };
|
|
1190
|
+
}
|
|
615
1191
|
catch (error) {
|
|
616
1192
|
return {
|
|
617
|
-
content: [{ type: "text", text: `ERROR:
|
|
1193
|
+
content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
|
|
618
1194
|
isError: true
|
|
619
1195
|
};
|
|
620
1196
|
}
|
|
@@ -626,16 +1202,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
626
1202
|
}
|
|
627
1203
|
try {
|
|
628
1204
|
const report = await imageAnalyzer.analyze(inputPath);
|
|
629
|
-
let output = `##
|
|
1205
|
+
let output = `## Image Quality Report\n\n`;
|
|
630
1206
|
output += `- **Total Images**: ${report.total_images}\n`;
|
|
631
1207
|
output += `- **Corrupted**: ${report.corrupted_count}\n`;
|
|
632
1208
|
output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
|
|
633
1209
|
output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
|
|
634
1210
|
if (report.individual_results.length > 0) {
|
|
635
|
-
output += `###
|
|
1211
|
+
output += `### Sample Detail (Top 5)\n`;
|
|
636
1212
|
report.individual_results.slice(0, 5).forEach(img => {
|
|
637
|
-
const
|
|
638
|
-
output += `${
|
|
1213
|
+
const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
|
|
1214
|
+
output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
|
|
639
1215
|
});
|
|
640
1216
|
}
|
|
641
1217
|
return {
|
|
@@ -656,7 +1232,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
656
1232
|
}
|
|
657
1233
|
try {
|
|
658
1234
|
const report = await mediaAnalyzer.analyze(inputPath);
|
|
659
|
-
let output = `##
|
|
1235
|
+
let output = `## Media Quality Report\n\n`;
|
|
660
1236
|
output += `- **Total Files**: ${report.total_files}\n`;
|
|
661
1237
|
output += `- **OK Files**: ${report.ok_files}\n`;
|
|
662
1238
|
output += `- **Failed Files**: ${report.failed_files}\n`;
|
|
@@ -667,17 +1243,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
667
1243
|
output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
|
|
668
1244
|
output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
|
|
669
1245
|
}
|
|
670
|
-
output += `\n###
|
|
1246
|
+
output += `\n### Sample Detail (Top 5)\n`;
|
|
671
1247
|
report.details.slice(0, 5).forEach(item => {
|
|
672
|
-
const
|
|
1248
|
+
const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
|
|
673
1249
|
if (item.type === "audio" && 'sample_rate' in item) {
|
|
674
|
-
output += `${
|
|
1250
|
+
output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
|
|
675
1251
|
}
|
|
676
1252
|
else if (item.type === "video" && 'width' in item) {
|
|
677
|
-
output += `${
|
|
1253
|
+
output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
|
|
678
1254
|
}
|
|
679
1255
|
else {
|
|
680
|
-
output += `${
|
|
1256
|
+
output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
|
|
681
1257
|
}
|
|
682
1258
|
});
|
|
683
1259
|
return {
|
|
@@ -708,39 +1284,39 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
708
1284
|
metadata.unified_quality_report = report;
|
|
709
1285
|
await metadataStore.saveDataset(metadata);
|
|
710
1286
|
}
|
|
711
|
-
let output = `#
|
|
1287
|
+
let output = `# Unified Quality Report\n\n`;
|
|
712
1288
|
output += `**Dataset**: ${datasetId}\n`;
|
|
713
1289
|
output += `**Modalities**: ${report.modalities.join(", ")}\n`;
|
|
714
1290
|
output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
|
|
715
1291
|
if (report.text_quality) {
|
|
716
|
-
output += `##
|
|
1292
|
+
output += `## Text Quality\n`;
|
|
717
1293
|
output += `- Rows: ${report.text_quality.row_count}\n`;
|
|
718
1294
|
output += `- Columns: ${report.text_quality.column_count}\n`;
|
|
719
1295
|
output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
|
|
720
1296
|
output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
|
|
721
1297
|
}
|
|
722
1298
|
if (report.image_quality) {
|
|
723
|
-
output += `##
|
|
1299
|
+
output += `## Image Quality\n`;
|
|
724
1300
|
output += `- Total Images: ${report.image_quality.total_images}\n`;
|
|
725
1301
|
output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
|
|
726
1302
|
output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
|
|
727
1303
|
output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
|
|
728
1304
|
}
|
|
729
1305
|
if (report.audio_quality) {
|
|
730
|
-
output += `##
|
|
1306
|
+
output += `## Audio Quality\n`;
|
|
731
1307
|
output += `- Total Files: ${report.audio_quality.total_files}\n`;
|
|
732
1308
|
output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
|
|
733
1309
|
output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
|
|
734
1310
|
output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
|
|
735
1311
|
}
|
|
736
1312
|
if (report.video_quality) {
|
|
737
|
-
output += `##
|
|
1313
|
+
output += `## Video Quality\n`;
|
|
738
1314
|
output += `- Total Files: ${report.video_quality.total_files}\n`;
|
|
739
1315
|
output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
|
|
740
1316
|
output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
|
|
741
1317
|
output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
|
|
742
1318
|
}
|
|
743
|
-
output += `##
|
|
1319
|
+
output += `## Recommendations\n`;
|
|
744
1320
|
report.recommendations.forEach(rec => {
|
|
745
1321
|
output += `- ${rec}\n`;
|
|
746
1322
|
});
|
|
@@ -761,8 +1337,32 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
761
1337
|
});
|
|
762
1338
|
async function main() {
|
|
763
1339
|
const args = process.argv.slice(2);
|
|
1340
|
+
hydrateExternalKeys();
|
|
1341
|
+
const isFuse = args.includes("fuse");
|
|
1342
|
+
const isDiscover = args.includes("discover");
|
|
1343
|
+
const isDownload = args.includes("download");
|
|
1344
|
+
const isConfig = args.includes("config");
|
|
764
1345
|
const isSetup = args.includes("--setup") || args.includes("setup");
|
|
765
1346
|
const isSilent = args.includes("--silent");
|
|
1347
|
+
if (process.stdin.isTTY && !isSilent) {
|
|
1348
|
+
printLaunchScreen();
|
|
1349
|
+
}
|
|
1350
|
+
if (isFuse) {
|
|
1351
|
+
await runFuseCli(args);
|
|
1352
|
+
return;
|
|
1353
|
+
}
|
|
1354
|
+
if (isConfig) {
|
|
1355
|
+
await runConfigCli(args);
|
|
1356
|
+
return;
|
|
1357
|
+
}
|
|
1358
|
+
if (isDiscover) {
|
|
1359
|
+
await runDiscoverCli(args);
|
|
1360
|
+
return;
|
|
1361
|
+
}
|
|
1362
|
+
if (isDownload) {
|
|
1363
|
+
await runDownloadCli(args);
|
|
1364
|
+
return;
|
|
1365
|
+
}
|
|
766
1366
|
// If run in setup mode OR in a terminal without args (human call), show setup wizard
|
|
767
1367
|
if (isSetup || (process.stdin.isTTY && args.length === 0)) {
|
|
768
1368
|
await runSetupWizard(isSilent);
|
|
@@ -778,24 +1378,274 @@ async function main() {
|
|
|
778
1378
|
console.error("Tip: To configure Vesper for your IDE, run: npx @vespermcp/mcp-server --setup");
|
|
779
1379
|
console.log("[Vesper] Main loop finished");
|
|
780
1380
|
}
|
|
1381
|
+
/**
 * Handles the `vespermcp config ...` CLI sub-commands.
 *
 * - `config keys`   prompts for a Hugging Face token, Kaggle username and
 *                   Kaggle key (Enter skips a field) and stores each non-empty
 *                   answer through `secureKeys.set`.
 * - `config kaggle` stores Kaggle credentials taken from `--username` /
 *                   `--key`, prompting for whichever one is missing.
 *
 * Prints usage and returns when neither sub-command is present or `--help`
 * is passed. Exits the process with code 1 when the Kaggle path ends up
 * without a username or key.
 *
 * @param {string[]} args - Raw CLI arguments (including the `config` token).
 * @returns {Promise<void>}
 */
async function runConfigCli(args) {
    const isKeys = args.includes("keys");
    const isKaggle = args.includes("kaggle");
    if (!(isKeys || isKaggle) || args.includes("--help")) {
        console.log("Usage: vespermcp config keys");
        console.log(" vespermcp config kaggle --username <name> --key <api_key>");
        console.log("Core Vesper tools work with zero API keys.");
        return;
    }
    // Returns the token that follows `name`, or undefined when absent.
    const getArgValue = (name) => {
        const idx = args.indexOf(name);
        return idx >= 0 && idx + 1 < args.length ? args[idx + 1] : undefined;
    };
    if (isKeys) {
        console.log("\nVesper Optional Keys Setup");
        console.log("(Press Enter to skip any field)\n");
        const current = secureKeys.getAll();
        const fields = [
            { id: "hf_token", env: "HF_TOKEN", prompt: "Hugging Face token", label: "HF token" },
            { id: "kaggle_username", env: "KAGGLE_USERNAME", prompt: "Kaggle username", label: "Kaggle username" },
            { id: "kaggle_key", env: "KAGGLE_KEY", prompt: "Kaggle key", label: "Kaggle key" },
        ];
        const answers = [];
        const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
        try {
            const ask = (q) => new Promise(resolve => rl.question(q, resolve));
            for (const field of fields) {
                const state = current[field.id] ? "saved" : "empty";
                answers.push((await ask(`${field.prompt} [${state}]: `)).trim());
            }
        }
        finally {
            // Always release stdin, even if a prompt fails.
            rl.close();
        }
        const saved = [];
        const failed = [];
        fields.forEach((field, i) => {
            const value = answers[i];
            if (!value)
                return; // field skipped by the user
            const res = secureKeys.set(field.id, value);
            if (res?.ok) {
                process.env[field.env] = value;
                saved.push(field.label);
            }
            else {
                failed.push(field.label);
            }
        });
        if (failed.length > 0) {
            // Distinguish "entered but not stored" from "skipped".
            console.error(`Could not save: ${failed.join(", ")}.`);
        }
        if (saved.length === 0) {
            if (failed.length === 0) {
                console.log("No new keys saved (all skipped). Core tools continue to work without keys.");
            }
            return;
        }
        console.log(`Key(s) saved securely: ${saved.join(", ")}`);
        console.log("You can now use Kaggle and gated Hugging Face datasets.");
        return;
    }
    // Backward-compatible Kaggle-specific path
    let username = getArgValue("--username") || "";
    let key = getArgValue("--key") || "";
    if (!username || !key) {
        const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
        try {
            const ask = (q) => new Promise(resolve => rl.question(q, resolve));
            if (!username)
                username = (await ask("Kaggle username: ")).trim();
            if (!key)
                key = (await ask("Kaggle key: ")).trim();
        }
        finally {
            rl.close();
        }
    }
    if (!username || !key) {
        console.error("Missing Kaggle username/key. Aborting.");
        process.exit(1);
    }
    const usernameResult = secureKeys.set("kaggle_username", username);
    const keyResult = secureKeys.set("kaggle_key", key);
    // The environment is updated regardless of the store result, as before,
    // so the credentials remain usable for the rest of this process.
    process.env.KAGGLE_USERNAME = username;
    process.env.KAGGLE_KEY = key;
    if (usernameResult?.ok && keyResult?.ok) {
        console.log("Key saved securely. You can now use Kaggle datasets.");
    }
    else {
        // Do not claim success when the key store rejected the write.
        console.error("Kaggle credentials could not be saved; they apply to this process only.");
    }
}
|
|
1458
|
+
/**
 * Handles `vespermcp discover [--source <name>] [--limit <n>] <query...>`.
 *
 * Every token that is neither `discover`, a `--flag`, nor the value of
 * `--source` / `--limit` is joined with spaces to form the query. When the
 * source is `kaggle` the search goes through `kaggleSource.discover`; any
 * other source value falls through to `HuggingFaceScraper`.
 *
 * Exits the process with code 1 when the query is empty, when Kaggle
 * credentials are still missing after the optional prompt, or when the
 * Kaggle search throws.
 *
 * @param {string[]} args - Raw CLI arguments (including the `discover` token).
 * @returns {Promise<void>}
 */
async function runDiscoverCli(args) {
    const DEFAULT_LIMIT = 10;
    // Returns the token that follows `name`, or undefined when absent.
    const getArgValue = (name) => {
        const idx = args.indexOf(name);
        return idx >= 0 && idx + 1 < args.length ? args[idx + 1] : undefined;
    };
    const source = (getArgValue("--source") || "huggingface").toLowerCase();
    // Validate --limit so a typo does not send NaN (or a non-positive or
    // fractional count) to the search backends.
    let limit = DEFAULT_LIMIT;
    const rawLimit = getArgValue("--limit");
    if (rawLimit !== undefined) {
        const parsed = Number(rawLimit);
        if (Number.isInteger(parsed) && parsed > 0) {
            limit = parsed;
        }
        else {
            console.error(`Invalid --limit "${rawLimit}"; using default ${DEFAULT_LIMIT}.`);
        }
    }
    const queryParts = [];
    for (let i = 0; i < args.length; i++) {
        const token = args[i];
        if (token === "discover")
            continue;
        if (token === "--source" || token === "--limit") {
            i += 1; // also skip the option's value
            continue;
        }
        if (token.startsWith("--"))
            continue;
        queryParts.push(token);
    }
    const query = queryParts.join(" ").trim();
    if (!query) {
        console.error("Usage: vespermcp discover --source kaggle \"credit risk\" --limit 10");
        process.exit(1);
    }
    if (source === "kaggle") {
        if (!dataIngestor.hasKaggleCredentials()) {
            console.error("Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).");
            if (process.stdin.isTTY) {
                let answer = "";
                const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
                try {
                    answer = await new Promise(resolve => rl.question("Configure Kaggle now? [y/N]: ", resolve));
                }
                finally {
                    // Release stdin before runConfigCli opens its own prompt.
                    rl.close();
                }
                if (answer.trim().toLowerCase() === "y") {
                    await runConfigCli(["config", "kaggle"]);
                }
            }
            // Re-check: the user may have declined or configuration may have failed.
            if (!dataIngestor.hasKaggleCredentials())
                process.exit(1);
        }
        try {
            const results = await kaggleSource.discover(query, limit);
            console.log(formatSearchResults(results));
        }
        catch (error) {
            const msg = String(error?.message || error);
            if (msg.toLowerCase().includes("kaggle package not installed")) {
                console.error("Kaggle support is optional and needs the official client: pip install kaggle");
            }
            else {
                console.error(`Kaggle discover failed: ${msg}`);
            }
            process.exit(1);
        }
        return;
    }
    const hf = new HuggingFaceScraper();
    const results = await hf.scrape(limit, true, query);
    console.log(formatSearchResults(results));
}
|
|
1519
|
+
/**
 * Handles `vespermcp download <source> <dataset-id> [--target-dir <dir>]`.
 *
 * Positional arguments are the tokens that are neither `--flags` nor the
 * value following `--target-dir`; index 1 is the source and index 2 the
 * dataset id (index 0 is expected to be the `download` token).
 *
 * A Kaggle download with an explicit target directory goes straight through
 * `kaggleSource.download` and is recorded with `metadataStore.registerDownload`;
 * every other combination is delegated to `dataIngestor.ensureData`.
 *
 * Exits the process with code 1 on missing arguments, missing Kaggle
 * credentials, or a failed download.
 *
 * @param {string[]} args - Raw CLI arguments (including the `download` token).
 * @returns {Promise<void>}
 */
async function runDownloadCli(args) {
    // Usage: vespermcp download kaggle user/dataset-name [--target-dir C:/path]
    const flagPos = args.indexOf("--target-dir");
    const flagHasValue = flagPos >= 0 && flagPos + 1 < args.length;
    const targetDir = flagHasValue ? args[flagPos + 1] : undefined;
    const positional = [];
    args.forEach((arg, pos) => {
        const isFlag = arg.startsWith("--");
        const isTargetDirValue = flagPos >= 0 && pos === flagPos + 1;
        if (!isFlag && !isTargetDirValue) {
            positional.push(arg);
        }
    });
    const source = (positional[1] || "").toLowerCase();
    const datasetId = positional[2] || "";
    if (!source || !datasetId) {
        console.error("Usage: vespermcp download kaggle <username/dataset-name> [--target-dir C:/path]");
        process.exit(1);
    }
    const isKaggle = source === "kaggle";
    if (isKaggle && !dataIngestor.hasKaggleCredentials()) {
        console.error("Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).");
        if (process.stdin.isTTY) {
            const prompt = readline.createInterface({ input: process.stdin, output: process.stdout });
            const reply = await new Promise(resolve => prompt.question("Configure Kaggle now? [y/N]: ", resolve));
            prompt.close();
            const accepted = reply.trim().toLowerCase() === "y";
            if (accepted) {
                await runConfigCli(["config", "kaggle"]);
            }
        }
        // Credentials may still be absent if the user declined the prompt.
        if (!dataIngestor.hasKaggleCredentials()) {
            process.exit(1);
        }
    }
    let localPath = "";
    try {
        if (isKaggle && targetDir) {
            // Accept a full dataset URL as well as a bare "user/name" id.
            const urlMarker = "kaggle.com/datasets/";
            let normalized = datasetId;
            if (datasetId.includes(urlMarker)) {
                normalized = datasetId.split(urlMarker)[1].replace(/^\//, "");
            }
            const { local_path: downloadedPath } = await kaggleSource.download(normalized, targetDir);
            localPath = downloadedPath;
            let size = 0;
            if (fs.existsSync(localPath)) {
                size = fs.statSync(localPath).size;
            }
            metadataStore.registerDownload(normalized, localPath, "completed", size);
        }
        else {
            localPath = await dataIngestor.ensureData(datasetId, source, (msg) => console.log(msg));
        }
    }
    catch (error) {
        const reason = String(error?.message || error);
        const missingClient = isKaggle && reason.toLowerCase().includes("kaggle package not installed");
        if (missingClient) {
            console.error("Kaggle support is optional and needs the official client: pip install kaggle");
        }
        else {
            console.error(`Download failed: ${reason}`);
        }
        process.exit(1);
    }
    console.log(`Download complete: ${localPath}`);
}
|
|
1576
|
+
/**
 * Handles `vespermcp fuse --sources <file1> <file2> [more] [options]`.
 *
 * Options read from `args`:
 *   --strategy <name>    defaults to "concat"
 *   --on <a,b,...>       comma-separated join columns
 *   --how <name>         defaults to "inner"
 *   --format <name>      defaults to "feather"; also used as the extension of
 *                        the default output file
 *   --compression <name>
 *   --output <path>      defaults to `fused_<timestamp>.<format>` in the cwd
 *   --dedup, --quality, --leakage   opt-in boolean flags
 *   --no-preview         disables the preview (enabled by default)
 *
 * Exits the process with code 1 when fewer than two sources are given.
 * Errors thrown by `fusionEngine.fuse` propagate to the caller.
 *
 * @param {string[]} args - Raw CLI arguments (including the `fuse` token).
 * @returns {Promise<void>}
 */
async function runFuseCli(args) {
    // Returns the token that follows `name`, or undefined when absent.
    const getArgValue = (name) => {
        const idx = args.indexOf(name);
        return idx >= 0 && idx + 1 < args.length ? args[idx + 1] : undefined;
    };
    // Collects the tokens after `name` up to (not including) the next --flag.
    const collectListAfter = (name) => {
        const idx = args.indexOf(name);
        if (idx < 0)
            return [];
        const rest = args.slice(idx + 1);
        const end = rest.findIndex(a => a.startsWith("--"));
        return end < 0 ? rest : rest.slice(0, end);
    };
    const sources = collectListAfter("--sources");
    if (sources.length < 2) {
        console.error("Usage: vespermcp fuse --sources <file1> <file2> [more] --strategy concat|join [--on id] [--how inner|left|outer] [--dedup] [--quality] [--leakage] [--format feather|parquet|csv|jsonl|arrow]");
        process.exit(1);
    }
    const strategy = getArgValue("--strategy") || "concat";
    const onValue = getArgValue("--on");
    const joinOn = onValue ? onValue.split(",").map(s => s.trim()).filter(Boolean) : undefined;
    const how = getArgValue("--how") || "inner";
    const outputFormat = getArgValue("--format") || "feather";
    const compression = getArgValue("--compression");
    const outputPath = getArgValue("--output") || path.join(process.cwd(), `fused_${Date.now()}.${outputFormat}`);
    const dedup = args.includes("--dedup");
    const runQualityAfter = args.includes("--quality");
    const leakageCheck = args.includes("--leakage");
    const preview = !args.includes("--no-preview");
    const result = await fusionEngine.fuse(sources, outputPath, {
        strategy,
        join_on: joinOn,
        how,
        dedup,
        run_quality_after: runQualityAfter,
        leakage_check: leakageCheck,
        output_format: outputFormat,
        compression,
        preview,
    });
    const nullDelta = result.stats.null_delta;
    const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
    console.log(`Fused ${result.stats.sources_count} sources → ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).`);
    if (result.stats.duplicates_removed != null) {
        console.log(`Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}`);
    }
    console.log(`Null increase: ${nullText}`);
    console.log(`Output: ${result.output_path}`);
    if (result.preview_path)
        console.log(`Preview saved: ${result.preview_path}`);
    if (result.leakage_report) {
        // Surface the leakage result so `--leakage` is not a silent no-op;
        // wording mirrors the fuse_datasets tool response.
        const verdict = result.leakage_report.leakage_detected ? "detected" : "none";
        const count = result.leakage_report.leakage_count ? ` (${result.leakage_report.leakage_count})` : "";
        console.log(`Leakage: ${verdict}${count}`);
    }
    console.log("Next: run vespermcp split/export on the fused dataset");
}
|
|
781
1631
|
/**
 * Installs the Vesper MCP configuration into the detected coding agents via
 * `ConfigManager.installToAll()`, run through `runWithSpinner`
 * (presumably a progress indicator around the callback; defined elsewhere).
 *
 * When both the `success` and `failed` lists of the result are empty, a
 * "no supported agents" hint is printed instead of the completion message.
 *
 * @param {boolean} [silent=false] - Suppresses all console output when true.
 * @returns {Promise<void>}
 */
async function runSetupWizard(silent = false) {
    // Single choke point for output so `silent` is honoured everywhere.
    const say = (line) => {
        if (!silent) {
            console.log(line);
        }
    };
    const manager = new ConfigManager();
    say(`\nVesper MCP - Universal Setup`);
    say(`================================`);
    say(`Installing to all detected coding agents...\n`);
    const outcome = await runWithSpinner("Installing to detected coding agents", () => manager.installToAll());
    const nothingDetected = outcome.success.length === 0 && outcome.failed.length === 0;
    if (nothingDetected) {
        say("\nNo supported agents detected.");
        say("Supported agents: Claude Code, Claude Desktop, Cursor, VS Code, Codex, Antigravity");
        say("\nMake sure at least one is installed, then try again.");
        return;
    }
    say("Setup complete! Please RESTART your IDE(s) to apply changes.");
}
|
|
801
1651
|
main().catch((error) => {
|