@vespermcp/mcp-server 1.2.14 ā 1.2.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/build/config/config-manager.js +1 -1
- package/build/index.js +128 -31
- package/build/ingestion/hf-downloader.js +71 -3
- package/build/ingestion/ingestor.js +6 -3
- package/build/python/export_engine.py +16 -0
- package/build/python/quality_engine.py +32 -8
- package/build/tools/formatter.js +6 -0
- package/mcp-config-template.json +5 -2
- package/package.json +3 -2
- package/scripts/wizard.js +307 -0
- package/src/python/export_engine.py +16 -0
- package/src/python/quality_engine.py +32 -8
- package/src/scripts/wizard.js +0 -77
package/README.md
CHANGED
|
@@ -36,7 +36,7 @@ Vesper is a Model Context Protocol (MCP) server that helps you find, analyze, an
|
|
|
36
36
|
The fastest way to install Vesper and configure it for **GitHub Copilot Chat** or **Cursor** is to run the automated setup:
|
|
37
37
|
|
|
38
38
|
```bash
|
|
39
|
-
npx -y @vespermcp/mcp-server@latest --setup
|
|
39
|
+
npx -y -p @vespermcp/mcp-server@latest vespermcp --setup
|
|
40
40
|
```
|
|
41
41
|
|
|
42
42
|
1. Select **Visual Studio Code (Settings.json)** from the list.
|
|
@@ -91,7 +91,7 @@ export class ConfigManager {
|
|
|
91
91
|
const isWin = process.platform === "win32";
|
|
92
92
|
return {
|
|
93
93
|
command: isWin ? "npx.cmd" : "npx",
|
|
94
|
-
args: ["-y", "@vespermcp/mcp-server@latest"],
|
|
94
|
+
args: ["-y", "-p", "@vespermcp/mcp-server@latest", "vespermcp"],
|
|
95
95
|
};
|
|
96
96
|
}
|
|
97
97
|
/**
|
package/build/index.js
CHANGED
|
@@ -43,11 +43,12 @@ function upsertRegistry(dataset_id, local_path, status) {
|
|
|
43
43
|
function getRegistryEntry(dataset_id) {
|
|
44
44
|
const norm_id = normalize_dataset_id(dataset_id);
|
|
45
45
|
console.error(`[Registry] Lookup key: ${norm_id}`);
|
|
46
|
-
return readRegistry().find(e => e.dataset_id === norm_id);
|
|
46
|
+
return readRegistry().find(e => (e.dataset_id || e.id) === norm_id);
|
|
47
47
|
}
|
|
48
48
|
// --- Pipeline State Tracker ---
|
|
49
49
|
// Tracks completed steps per session/job/dataset
|
|
50
50
|
const pipelineState = {};
|
|
51
|
+
const jobStatusLastPoll = {};
|
|
51
52
|
function getPipelineKey(datasetId) {
|
|
52
53
|
return datasetId;
|
|
53
54
|
}
|
|
@@ -77,6 +78,7 @@ import { fileURLToPath } from "url";
|
|
|
77
78
|
import path from "path";
|
|
78
79
|
import fs from "fs";
|
|
79
80
|
import { spawn } from "child_process";
|
|
81
|
+
import { spawnSync } from "child_process";
|
|
80
82
|
import { MetadataStore } from "./metadata/store.js";
|
|
81
83
|
import { VectorStore } from "./search/vector-store.js";
|
|
82
84
|
import { Embedder } from "./search/embedder.js";
|
|
@@ -348,7 +350,7 @@ function syncPythonScripts(appRoot, dataRoot) {
|
|
|
348
350
|
let shouldCopy = true;
|
|
349
351
|
if (fs.existsSync(destPath)) {
|
|
350
352
|
const destStat = fs.statSync(destPath);
|
|
351
|
-
if (srcStat.size === destStat.size)
|
|
353
|
+
if (srcStat.size === destStat.size && srcStat.mtimeMs <= destStat.mtimeMs)
|
|
352
354
|
shouldCopy = false;
|
|
353
355
|
}
|
|
354
356
|
if (shouldCopy) {
|
|
@@ -450,17 +452,55 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
450
452
|
async function handlePrepareJob(jobId, query, requirements) {
|
|
451
453
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
452
454
|
const requestedRows = extractRequestedRows(query, requirements);
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
455
|
+
let selectedDataset;
|
|
456
|
+
let datasetIdForDownload = "";
|
|
457
|
+
let source;
|
|
458
|
+
const parsedQuery = parseDatasetId(query);
|
|
459
|
+
const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
|
|
460
|
+
if (isExplicitDatasetRef) {
|
|
461
|
+
let explicitId = parsedQuery;
|
|
462
|
+
if (/^hf:/i.test(explicitId)) {
|
|
463
|
+
explicitId = explicitId.replace(/^hf:/i, "huggingface:");
|
|
464
|
+
}
|
|
465
|
+
if (/^kaggle:/i.test(explicitId)) {
|
|
466
|
+
source = "kaggle";
|
|
467
|
+
datasetIdForDownload = explicitId.replace(/^kaggle:/i, "");
|
|
468
|
+
}
|
|
469
|
+
else if (/^huggingface:/i.test(explicitId)) {
|
|
470
|
+
source = "huggingface";
|
|
471
|
+
datasetIdForDownload = explicitId.replace(/^huggingface:/i, "");
|
|
472
|
+
}
|
|
473
|
+
else if (/^openml:/i.test(explicitId)) {
|
|
474
|
+
source = "openml";
|
|
475
|
+
datasetIdForDownload = explicitId.replace(/^openml:/i, "");
|
|
476
|
+
}
|
|
477
|
+
else if (/^dataworld:/i.test(explicitId)) {
|
|
478
|
+
source = "dataworld";
|
|
479
|
+
datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
|
|
480
|
+
}
|
|
481
|
+
else {
|
|
482
|
+
source = "kaggle";
|
|
483
|
+
datasetIdForDownload = explicitId;
|
|
484
|
+
}
|
|
485
|
+
update({
|
|
486
|
+
progress: 20,
|
|
487
|
+
status_text: `Using explicit dataset id: ${datasetIdForDownload} (${source})`
|
|
488
|
+
});
|
|
489
|
+
}
|
|
490
|
+
else {
|
|
491
|
+
update({ progress: 10, status_text: "Searching for best dataset matching query..." });
|
|
492
|
+
const results = await searchEngine.search(query, { limit: 1 });
|
|
493
|
+
if (results.length === 0) {
|
|
494
|
+
throw new Error("No datasets found matching the query. Try refining your search terms.");
|
|
495
|
+
}
|
|
496
|
+
selectedDataset = results[0];
|
|
497
|
+
datasetIdForDownload = selectedDataset.id;
|
|
498
|
+
source = selectedDataset.source;
|
|
499
|
+
update({
|
|
500
|
+
progress: 20,
|
|
501
|
+
status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
|
|
502
|
+
});
|
|
457
503
|
}
|
|
458
|
-
const topDataset = results[0];
|
|
459
|
-
update({
|
|
460
|
-
progress: 20,
|
|
461
|
-
status_text: `Matched: ${topDataset.name} (${topDataset.source})`
|
|
462
|
-
});
|
|
463
|
-
const source = topDataset.source;
|
|
464
504
|
// Pre-check credentials for Kaggle
|
|
465
505
|
if (source === "kaggle") {
|
|
466
506
|
if (!process.env.KAGGLE_USERNAME || !process.env.KAGGLE_KEY ||
|
|
@@ -470,10 +510,10 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
470
510
|
}
|
|
471
511
|
update({ progress: 30, status_text: `Starting download from ${source}...` });
|
|
472
512
|
// ensureData handles download and returns path to the raw file
|
|
473
|
-
let rawFilePath = await dataIngestor.ensureData(
|
|
513
|
+
let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
|
|
474
514
|
update({ status_text: msg, progress: 30 + (prog ? Math.floor(prog * 0.4) : 0) });
|
|
475
515
|
});
|
|
476
|
-
if (requestedRows && requestedRows > 0) {
|
|
516
|
+
if (requestedRows && requestedRows > 0 && !isExplicitDatasetRef) {
|
|
477
517
|
update({ progress: 62, status_text: `Validating requested sample count (${requestedRows.toLocaleString()})...` });
|
|
478
518
|
let currentRows = await countRows(rawFilePath);
|
|
479
519
|
if (currentRows < requestedRows) {
|
|
@@ -482,7 +522,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
482
522
|
const sourceFiles = [rawFilePath];
|
|
483
523
|
let totalRows = currentRows;
|
|
484
524
|
for (const ds of additional) {
|
|
485
|
-
if (ds.id ===
|
|
525
|
+
if (ds.id === datasetIdForDownload)
|
|
486
526
|
continue;
|
|
487
527
|
try {
|
|
488
528
|
const dsSource = ds.source;
|
|
@@ -516,10 +556,10 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
516
556
|
rawFilePath = fusionResult.output_path;
|
|
517
557
|
try {
|
|
518
558
|
// Register fused output for this top dataset so export can find it
|
|
519
|
-
upsertRegistry(
|
|
559
|
+
upsertRegistry(datasetIdForDownload, rawFilePath, "completed");
|
|
520
560
|
}
|
|
521
561
|
catch (e) {
|
|
522
|
-
console.error(`[Registry] Failed to write registry for fused output ${
|
|
562
|
+
console.error(`[Registry] Failed to write registry for fused output ${datasetIdForDownload}: ${e?.message || e}`);
|
|
523
563
|
}
|
|
524
564
|
currentRows = await countRows(rawFilePath);
|
|
525
565
|
}
|
|
@@ -530,22 +570,31 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
530
570
|
update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
|
|
531
571
|
}
|
|
532
572
|
}
|
|
573
|
+
let qualityScore = selectedDataset?.quality_score ?? 70;
|
|
533
574
|
update({ progress: 70, status_text: "Analyzing dataset quality..." });
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
575
|
+
try {
|
|
576
|
+
const report = await qualityAnalyzer.analyze(rawFilePath);
|
|
577
|
+
qualityScore = report.overall_score;
|
|
578
|
+
}
|
|
579
|
+
catch (error) {
|
|
580
|
+
console.error(`[Prepare] Quality analysis failed for ${datasetIdForDownload}: ${error?.message || error}`);
|
|
581
|
+
update({ progress: 78, status_text: "Quality analysis skipped (unsupported schema). Continuing installation..." });
|
|
582
|
+
}
|
|
583
|
+
if (selectedDataset) {
|
|
584
|
+
metadataStore.saveDataset({
|
|
585
|
+
...selectedDataset,
|
|
586
|
+
quality_score: qualityScore
|
|
587
|
+
});
|
|
588
|
+
}
|
|
540
589
|
update({ progress: 85, status_text: "Installing dataset into project..." });
|
|
541
|
-
const installPath = await installService.install(
|
|
590
|
+
const installPath = await installService.install(datasetIdForDownload, rawFilePath);
|
|
542
591
|
update({ progress: 100, status_text: "Preparation complete!" });
|
|
543
592
|
// Register prepared dataset in local registry for lookup by export/list tools
|
|
544
593
|
try {
|
|
545
|
-
upsertRegistry(
|
|
594
|
+
upsertRegistry(datasetIdForDownload, installPath, "completed");
|
|
546
595
|
}
|
|
547
596
|
catch (e) {
|
|
548
|
-
console.error(`[Registry] Failed to write registry for ${
|
|
597
|
+
console.error(`[Registry] Failed to write registry for ${datasetIdForDownload}: ${e?.message || e}`);
|
|
549
598
|
}
|
|
550
599
|
return installPath;
|
|
551
600
|
}
|
|
@@ -1443,6 +1492,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1443
1492
|
if (!job) {
|
|
1444
1493
|
throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
|
|
1445
1494
|
}
|
|
1495
|
+
const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
|
|
1496
|
+
const now = Date.now();
|
|
1497
|
+
const last = jobStatusLastPoll[jobId] || 0;
|
|
1498
|
+
const minPollMs = 3000;
|
|
1499
|
+
if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
|
|
1500
|
+
const waitMs = minPollMs - (now - last);
|
|
1501
|
+
return {
|
|
1502
|
+
content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
|
|
1503
|
+
};
|
|
1504
|
+
}
|
|
1505
|
+
jobStatusLastPoll[jobId] = now;
|
|
1446
1506
|
return {
|
|
1447
1507
|
content: [{ type: "text", text: formatJobStatus(job) }]
|
|
1448
1508
|
};
|
|
@@ -1482,9 +1542,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1482
1542
|
catch (e) {
|
|
1483
1543
|
console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
|
|
1484
1544
|
}
|
|
1485
|
-
// Poll for download status until local_path appears or timeout
|
|
1545
|
+
// Poll for download status or registry entry until local_path appears or timeout
|
|
1486
1546
|
const wait = (ms) => new Promise(res => setTimeout(res, ms));
|
|
1487
|
-
const maxWait =
|
|
1547
|
+
const maxWait = 120_000; // 120s
|
|
1488
1548
|
const interval = 2000;
|
|
1489
1549
|
let waited = 0;
|
|
1490
1550
|
while (waited < maxWait) {
|
|
@@ -1494,13 +1554,20 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1494
1554
|
console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
|
|
1495
1555
|
break;
|
|
1496
1556
|
}
|
|
1557
|
+
const reg = getRegistryEntry(datasetId);
|
|
1558
|
+
const regPath = reg?.local_path || reg?.path;
|
|
1559
|
+
if (regPath && fs.existsSync(regPath)) {
|
|
1560
|
+
sourcePath = regPath;
|
|
1561
|
+
console.error(`[Export] Local data found in registry for ${datasetId}: ${sourcePath}`);
|
|
1562
|
+
break;
|
|
1563
|
+
}
|
|
1497
1564
|
await wait(interval);
|
|
1498
1565
|
waited += interval;
|
|
1499
1566
|
}
|
|
1500
1567
|
// If still no sourcePath, return helpful error listing prepared datasets
|
|
1501
1568
|
if (!sourcePath) {
|
|
1502
1569
|
const entries = readRegistry();
|
|
1503
|
-
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id}: ${e.local_path}`).join("\n");
|
|
1570
|
+
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
|
|
1504
1571
|
return {
|
|
1505
1572
|
content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
|
|
1506
1573
|
isError: true
|
|
@@ -1511,7 +1578,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1511
1578
|
if (!fastMode) {
|
|
1512
1579
|
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
1513
1580
|
const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
|
|
1514
|
-
|
|
1581
|
+
const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
|
|
1582
|
+
if (!pipelineCompatibleInput) {
|
|
1583
|
+
console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
|
|
1584
|
+
}
|
|
1585
|
+
else if (currentExt !== pipelineFmt) {
|
|
1515
1586
|
console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
|
|
1516
1587
|
try {
|
|
1517
1588
|
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
@@ -1853,7 +1924,15 @@ async function main() {
|
|
|
1853
1924
|
const transport = new StdioServerTransport();
|
|
1854
1925
|
await server.connect(transport);
|
|
1855
1926
|
console.error("Vesper MCP server running on stdio");
|
|
1856
|
-
console.error("Tip: To configure Vesper for your IDE, run: npx @vespermcp/mcp-server --setup");
|
|
1927
|
+
console.error("Tip: To configure Vesper for your IDE, run: npx -y -p @vespermcp/mcp-server@latest vespermcp --setup");
|
|
1928
|
+
await new Promise((resolve) => {
|
|
1929
|
+
const done = () => resolve();
|
|
1930
|
+
process.stdin.resume();
|
|
1931
|
+
process.stdin.once("end", done);
|
|
1932
|
+
process.stdin.once("close", done);
|
|
1933
|
+
process.once("SIGINT", done);
|
|
1934
|
+
process.once("SIGTERM", done);
|
|
1935
|
+
});
|
|
1857
1936
|
console.error("[Vesper] Main loop finished");
|
|
1858
1937
|
}
|
|
1859
1938
|
async function runConfigCli(args) {
|
|
@@ -2161,6 +2240,24 @@ async function runFuseCli(args) {
|
|
|
2161
2240
|
console.log("Next: run vespermcp split/export on the fused dataset");
|
|
2162
2241
|
}
|
|
2163
2242
|
async function runSetupWizard(silent = false) {
|
|
2243
|
+
if (!silent && process.stdin.isTTY) {
|
|
2244
|
+
const wizardCandidates = [
|
|
2245
|
+
path.join(appRoot, "scripts", "wizard.js"),
|
|
2246
|
+
path.join(appRoot, "src", "scripts", "wizard.js"),
|
|
2247
|
+
path.join(process.cwd(), "vesper-wizard", "wizard.js"),
|
|
2248
|
+
];
|
|
2249
|
+
const wizardScript = wizardCandidates.find(candidate => fs.existsSync(candidate));
|
|
2250
|
+
if (wizardScript) {
|
|
2251
|
+
console.error("[Vesper Setup] Running guided wizard...");
|
|
2252
|
+
const result = spawnSync(process.execPath, [wizardScript], {
|
|
2253
|
+
stdio: "inherit",
|
|
2254
|
+
env: process.env,
|
|
2255
|
+
});
|
|
2256
|
+
if ((result.status ?? 1) !== 0) {
|
|
2257
|
+
console.error("[Vesper Setup] Wizard exited with non-zero status, continuing with automatic MCP config only.");
|
|
2258
|
+
}
|
|
2259
|
+
}
|
|
2260
|
+
}
|
|
2164
2261
|
const configManager = new ConfigManager();
|
|
2165
2262
|
if (!silent) {
|
|
2166
2263
|
console.error(`\nVesper MCP - Universal Setup`);
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { listFiles } from "@huggingface/hub";
|
|
2
|
+
import fs from "fs";
|
|
2
3
|
import path from "path";
|
|
3
4
|
import { RobustDownloader } from "../utils/downloader.js";
|
|
4
5
|
export class HFDownloader {
|
|
@@ -19,6 +20,7 @@ export class HFDownloader {
|
|
|
19
20
|
try {
|
|
20
21
|
const token = this.getToken();
|
|
21
22
|
const files = [];
|
|
23
|
+
const metadataFiles = [];
|
|
22
24
|
const blacklist = [
|
|
23
25
|
".gitattributes",
|
|
24
26
|
".gitignore",
|
|
@@ -29,6 +31,15 @@ export class HFDownloader {
|
|
|
29
31
|
"requirements.txt",
|
|
30
32
|
"setup.py"
|
|
31
33
|
];
|
|
34
|
+
const metadataNamePatterns = [
|
|
35
|
+
/^dataset_infos?\.json$/i,
|
|
36
|
+
/^dataset_dict\.json$/i,
|
|
37
|
+
/^state\.json$/i,
|
|
38
|
+
/^config\.json$/i,
|
|
39
|
+
/^metadata\.json$/i,
|
|
40
|
+
/^stats\.json$/i,
|
|
41
|
+
/^index\.json$/i
|
|
42
|
+
];
|
|
32
43
|
for await (const file of listFiles({
|
|
33
44
|
repo: { type: "dataset", name: repoId },
|
|
34
45
|
recursive: true,
|
|
@@ -36,7 +47,11 @@ export class HFDownloader {
|
|
|
36
47
|
})) {
|
|
37
48
|
if (file.type === "file") {
|
|
38
49
|
const fileName = path.basename(file.path);
|
|
39
|
-
|
|
50
|
+
const isMetadataJson = metadataNamePatterns.some(p => p.test(fileName));
|
|
51
|
+
if (isMetadataJson) {
|
|
52
|
+
metadataFiles.push(file.path);
|
|
53
|
+
}
|
|
54
|
+
if (!blacklist.includes(fileName) && !fileName.startsWith(".") && !isMetadataJson) {
|
|
40
55
|
files.push(file.path);
|
|
41
56
|
}
|
|
42
57
|
}
|
|
@@ -49,7 +64,15 @@ export class HFDownloader {
|
|
|
49
64
|
/train.*\.csv$/i,
|
|
50
65
|
/data.*\.csv$/i,
|
|
51
66
|
/.*\.csv$/i,
|
|
67
|
+
/train.*\.tsv$/i,
|
|
68
|
+
/data.*\.tsv$/i,
|
|
69
|
+
/.*\.tsv$/i,
|
|
70
|
+
/train.*\.txt$/i,
|
|
71
|
+
/data.*\.txt$/i,
|
|
72
|
+
/.*\.txt$/i,
|
|
52
73
|
/.*\.jsonl$/i,
|
|
74
|
+
/.*\.ndjson$/i,
|
|
75
|
+
// Keep plain JSON as lowest priority to avoid selecting metadata-like files.
|
|
53
76
|
/.*\.json$/i
|
|
54
77
|
];
|
|
55
78
|
for (const pattern of priorities) {
|
|
@@ -58,12 +81,16 @@ export class HFDownloader {
|
|
|
58
81
|
return match;
|
|
59
82
|
}
|
|
60
83
|
// Strict fallback: Only return the first file if it has a data-like extension
|
|
61
|
-
const dataExtensions = [".csv", ".parquet", ".jsonl", ".
|
|
84
|
+
const dataExtensions = [".csv", ".parquet", ".jsonl", ".ndjson", ".tsv", ".txt", ".json", ".avro", ".orc"];
|
|
62
85
|
const fallback = files.find(f => {
|
|
63
86
|
const ext = path.extname(f).toLowerCase();
|
|
64
87
|
return dataExtensions.includes(ext);
|
|
65
88
|
});
|
|
66
|
-
|
|
89
|
+
if (fallback)
|
|
90
|
+
return fallback;
|
|
91
|
+
// Last-resort: allow dataset metadata file, then resolve external raw URLs later.
|
|
92
|
+
const metadataFallback = metadataFiles.find(f => /dataset_infos?\.json$/i.test(path.basename(f)));
|
|
93
|
+
return metadataFallback || null;
|
|
67
94
|
}
|
|
68
95
|
catch (error) {
|
|
69
96
|
const msg = String(error?.message || error);
|
|
@@ -90,4 +117,45 @@ export class HFDownloader {
|
|
|
90
117
|
}
|
|
91
118
|
});
|
|
92
119
|
}
|
|
120
|
+
/**
|
|
121
|
+
* If downloaded file is dataset metadata (dataset_infos.json), resolve and download a real data URL.
|
|
122
|
+
* Returns the actual local data path to use.
|
|
123
|
+
*/
|
|
124
|
+
async resolveExternalDataFromMetadata(localPath, onProgress) {
|
|
125
|
+
const ext = path.extname(localPath).toLowerCase();
|
|
126
|
+
if (ext !== ".json") {
|
|
127
|
+
return localPath;
|
|
128
|
+
}
|
|
129
|
+
try {
|
|
130
|
+
const raw = fs.readFileSync(localPath, "utf-8");
|
|
131
|
+
const parsed = JSON.parse(raw);
|
|
132
|
+
const firstConfig = parsed?.default || Object.values(parsed || {})[0];
|
|
133
|
+
const checksums = firstConfig?.download_checksums;
|
|
134
|
+
if (!checksums || typeof checksums !== "object") {
|
|
135
|
+
return localPath;
|
|
136
|
+
}
|
|
137
|
+
const candidateUrls = Object.keys(checksums).filter((u) => /^https?:\/\//i.test(u));
|
|
138
|
+
if (candidateUrls.length === 0) {
|
|
139
|
+
return localPath;
|
|
140
|
+
}
|
|
141
|
+
const preferred = candidateUrls.find(u => /train|data/i.test(path.basename(u))) || candidateUrls[0];
|
|
142
|
+
const ext = path.extname(preferred).toLowerCase() || ".csv";
|
|
143
|
+
const resolvedPath = localPath.replace(/\.json$/i, ext);
|
|
144
|
+
await this.downloader.download(preferred, resolvedPath, {
|
|
145
|
+
resume: true,
|
|
146
|
+
onProgress: (bytes, total) => {
|
|
147
|
+
if (total > 0 && onProgress) {
|
|
148
|
+
onProgress(Math.round((bytes / total) * 100));
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
});
|
|
152
|
+
if (fs.existsSync(resolvedPath) && fs.statSync(resolvedPath).size > 0) {
|
|
153
|
+
return resolvedPath;
|
|
154
|
+
}
|
|
155
|
+
return localPath;
|
|
156
|
+
}
|
|
157
|
+
catch {
|
|
158
|
+
return localPath;
|
|
159
|
+
}
|
|
160
|
+
}
|
|
93
161
|
}
|
|
@@ -72,9 +72,12 @@ export class DataIngestor {
|
|
|
72
72
|
await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
|
|
73
73
|
onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
|
|
74
74
|
});
|
|
75
|
-
const
|
|
76
|
-
|
|
77
|
-
|
|
75
|
+
const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
|
|
76
|
+
onProgress?.("Resolving external dataset file...", progress);
|
|
77
|
+
});
|
|
78
|
+
const stats = fs.statSync(resolvedPath);
|
|
79
|
+
this.completeDownload(datasetId, resolvedPath, stats.size);
|
|
80
|
+
return resolvedPath;
|
|
78
81
|
}
|
|
79
82
|
catch (e) {
|
|
80
83
|
this.failDownload(datasetId, e.message);
|
|
@@ -31,6 +31,19 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
|
|
|
31
31
|
ext = os.path.splitext(file_path)[1].lower()
|
|
32
32
|
if ext == ".csv":
|
|
33
33
|
df = pl.read_csv(file_path, ignore_errors=True)
|
|
34
|
+
elif ext == ".tsv":
|
|
35
|
+
df = pl.read_csv(file_path, separator="\t", ignore_errors=True)
|
|
36
|
+
elif ext == ".txt":
|
|
37
|
+
# Heuristic delimiter detection for plain text tabular files.
|
|
38
|
+
sep = ","
|
|
39
|
+
try:
|
|
40
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
41
|
+
first_line = fh.readline()
|
|
42
|
+
if "\t" in first_line:
|
|
43
|
+
sep = "\t"
|
|
44
|
+
except Exception:
|
|
45
|
+
sep = ","
|
|
46
|
+
df = pl.read_csv(file_path, separator=sep, ignore_errors=True)
|
|
34
47
|
elif ext in (".parquet", ".pq"):
|
|
35
48
|
df = pl.read_parquet(file_path)
|
|
36
49
|
elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
|
|
@@ -40,6 +53,9 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
|
|
|
40
53
|
else:
|
|
41
54
|
raise ValueError(f"Unsupported input format: {ext}")
|
|
42
55
|
|
|
56
|
+
if len(df) == 0:
|
|
57
|
+
raise ValueError("empty CSV")
|
|
58
|
+
|
|
43
59
|
# Column selection (before sampling for speed)
|
|
44
60
|
if columns:
|
|
45
61
|
valid = [c for c in columns if c in df.columns]
|
|
@@ -102,6 +102,18 @@ def main():
|
|
|
102
102
|
file_path_lower = file_path.lower()
|
|
103
103
|
if file_path_lower.endswith(".csv"):
|
|
104
104
|
df = pl.read_csv(file_path, ignore_errors=True, n_rows=10000)
|
|
105
|
+
elif file_path_lower.endswith(".tsv"):
|
|
106
|
+
df = pl.read_csv(file_path, separator="\t", ignore_errors=True, n_rows=10000)
|
|
107
|
+
elif file_path_lower.endswith(".txt"):
|
|
108
|
+
sep = ","
|
|
109
|
+
try:
|
|
110
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
111
|
+
first_line = fh.readline()
|
|
112
|
+
if "\t" in first_line:
|
|
113
|
+
sep = "\t"
|
|
114
|
+
except Exception:
|
|
115
|
+
sep = ","
|
|
116
|
+
df = pl.read_csv(file_path, separator=sep, ignore_errors=True, n_rows=10000)
|
|
105
117
|
elif file_path_lower.endswith(".parquet"):
|
|
106
118
|
try:
|
|
107
119
|
# Try scanning first (faster for large files)
|
|
@@ -133,10 +145,18 @@ def main():
|
|
|
133
145
|
column_count = len(df.columns)
|
|
134
146
|
|
|
135
147
|
# Duplicate detection (exact)
|
|
148
|
+
# NOTE: Some Polars versions can panic on is_duplicated() for nested/null rows.
|
|
149
|
+
# Use a Python fallback that is slower but robust for the 10k sampled rows.
|
|
150
|
+
duplicate_count = 0
|
|
136
151
|
try:
|
|
137
|
-
|
|
152
|
+
seen = set()
|
|
153
|
+
for row in df.to_dicts():
|
|
154
|
+
row_key = json.dumps(row, sort_keys=True, default=str)
|
|
155
|
+
if row_key in seen:
|
|
156
|
+
duplicate_count += 1
|
|
157
|
+
else:
|
|
158
|
+
seen.add(row_key)
|
|
138
159
|
except Exception:
|
|
139
|
-
# Duplicate check might fail on complex nested types (List, Struct)
|
|
140
160
|
duplicate_count = 0
|
|
141
161
|
|
|
142
162
|
columns_stats = []
|
|
@@ -165,12 +185,16 @@ def main():
|
|
|
165
185
|
if duplicate_count == 0 and len(text_cols) > 0:
|
|
166
186
|
# Pick longest text column as likely "content"
|
|
167
187
|
# In real impl, we'd use heuristics. For now, first text col.
|
|
168
|
-
target_col = text_cols[0]
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
188
|
+
target_col = text_cols[0]
|
|
189
|
+
try:
|
|
190
|
+
text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
|
|
191
|
+
if text_dupes > 0:
|
|
192
|
+
report["text_duplicates"] = int(text_dupes)
|
|
193
|
+
if text_dupes > (row_count * 0.2):
|
|
194
|
+
report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")
|
|
195
|
+
except Exception:
|
|
196
|
+
# Skip text duplicate warning if backend cannot compute duplicates for this dtype
|
|
197
|
+
pass
|
|
174
198
|
|
|
175
199
|
# Integrity Check 2: Contamination / Leakage (Basic)
|
|
176
200
|
# (Skipping correlation for now)
|
package/build/tools/formatter.js
CHANGED
|
@@ -20,6 +20,12 @@ export function formatJobStatus(job) {
|
|
|
20
20
|
output += `Status: ${statusText}\n`;
|
|
21
21
|
output += `Progress: ${bar} ${job.progress}%\n`;
|
|
22
22
|
output += `Activity: ${job.status_text}\n`;
|
|
23
|
+
if (job.status === "running" || job.status === "retrying" || job.status === "queued" || job.status === "pending") {
|
|
24
|
+
output += `Polling hint: check again in 5-10 seconds.\n`;
|
|
25
|
+
}
|
|
26
|
+
else {
|
|
27
|
+
output += `Polling hint: no further polling required.\n`;
|
|
28
|
+
}
|
|
23
29
|
if (job.result_url) {
|
|
24
30
|
output += `\nResult: ${job.result_url}\n`;
|
|
25
31
|
}
|
package/mcp-config-template.json
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"mcpServers": {
|
|
3
3
|
"vesper": {
|
|
4
|
-
"command": "
|
|
4
|
+
"command": "npx",
|
|
5
5
|
"args": [
|
|
6
|
-
"
|
|
6
|
+
"-y",
|
|
7
|
+
"-p",
|
|
8
|
+
"@vespermcp/mcp-server@latest",
|
|
9
|
+
"vespermcp"
|
|
7
10
|
],
|
|
8
11
|
"env": {
|
|
9
12
|
"KAGGLE_USERNAME": "your-kaggle-username",
|
package/package.json
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.
|
|
3
|
+
"version": "1.2.16",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
|
7
7
|
"bin": {
|
|
8
8
|
"vespermcp": "./build/index.js",
|
|
9
|
+
"mcp-server": "./build/index.js",
|
|
9
10
|
"@vespermcp/mcp-server": "./build/index.js",
|
|
10
|
-
"vesper-wizard": "
|
|
11
|
+
"vesper-wizard": "scripts/wizard.js"
|
|
11
12
|
},
|
|
12
13
|
"files": [
|
|
13
14
|
"build/**/*",
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
4
|
+
// vesper-wizard ā Zero-friction local setup for Vesper MCP
|
|
5
|
+
// Run: npx vesper-wizard@latest
|
|
6
|
+
// āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
7
|
+
|
|
8
|
+
const fs = require('fs');
|
|
9
|
+
const path = require('path');
|
|
10
|
+
const os = require('os');
|
|
11
|
+
const crypto = require('crypto');
|
|
12
|
+
const { execSync, spawnSync } = require('child_process');
|
|
13
|
+
|
|
14
|
+
// āā Paths āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
15
|
+
const HOME = os.homedir();
|
|
16
|
+
const VESPER_DIR = path.join(HOME, '.vesper');
|
|
17
|
+
const CONFIG_TOML = path.join(VESPER_DIR, 'config.toml');
|
|
18
|
+
const DATA_DIR = path.join(VESPER_DIR, 'data');
|
|
19
|
+
const IS_WIN = process.platform === 'win32';
|
|
20
|
+
const APPDATA = process.env.APPDATA || path.join(HOME, 'AppData', 'Roaming');
|
|
21
|
+
|
|
22
|
+
// āā Helpers āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
23
|
+
function ensureDir(dir) {
|
|
24
|
+
if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
function generateLocalKey() {
|
|
28
|
+
const random = crypto.randomBytes(24).toString('hex');
|
|
29
|
+
return `vesper_sk_local_${random}`;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
function readToml(filePath) {
|
|
33
|
+
if (!fs.existsSync(filePath)) return {};
|
|
34
|
+
const content = fs.readFileSync(filePath, 'utf8');
|
|
35
|
+
const obj = {};
|
|
36
|
+
for (const line of content.split('\n')) {
|
|
37
|
+
const m = line.match(/^\s*(\w+)\s*=\s*"(.*)"\s*$/);
|
|
38
|
+
if (m) obj[m[1]] = m[2];
|
|
39
|
+
}
|
|
40
|
+
return obj;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
function writeToml(filePath, data) {
|
|
44
|
+
ensureDir(path.dirname(filePath));
|
|
45
|
+
const lines = Object.entries(data).map(([k, v]) => `${k} = "${v}"`);
|
|
46
|
+
fs.writeFileSync(filePath, lines.join('\n') + '\n', 'utf8');
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
function dim(text) { return `\x1b[2m${text}\x1b[0m`; }
|
|
50
|
+
function bold(text) { return `\x1b[1m${text}\x1b[0m`; }
|
|
51
|
+
function green(text) { return `\x1b[32m${text}\x1b[0m`; }
|
|
52
|
+
function cyan(text) { return `\x1b[36m${text}\x1b[0m`; }
|
|
53
|
+
function yellow(text) { return `\x1b[33m${text}\x1b[0m`; }
|
|
54
|
+
function red(text) { return `\x1b[31m${text}\x1b[0m`; }
|
|
55
|
+
function magenta(text) { return `\x1b[35m${text}\x1b[0m`; }
|
|
56
|
+
|
|
57
|
+
function printBanner() {
|
|
58
|
+
console.log(`
|
|
59
|
+
${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}
|
|
60
|
+
|
|
61
|
+
${bold('āā āā āāāāāāā āāāāāāā āāāāāā āāāāāāā āāāāāā')}
|
|
62
|
+
${bold('āā āā āā āā āā āā āā āā āā')}
|
|
63
|
+
${bold('āā āā āāāāā āāāāāāā āāāāāā āāāāā āāāāāā')}
|
|
64
|
+
${bold(' āā āā āā āā āā āā āā āā')}
|
|
65
|
+
${bold(' āāāā āāāāāāā āāāāāāā āā āāāāāāā āā āā')}
|
|
66
|
+
|
|
67
|
+
${cyan('dataset intelligence layer')}
|
|
68
|
+
${dim('local-first ⢠zero-config ⢠agent-native')}
|
|
69
|
+
|
|
70
|
+
${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}
|
|
71
|
+
`);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
// āā MCP Auto-Config āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
75
|
+
/**
 * Enumerate every coding agent whose MCP config this wizard can edit.
 * Each descriptor carries the agent's display name, its config-file
 * path for the current OS, and the file format consumed by
 * installMcpToAgent(): 'mcpServers' / 'servers' (JSON top-level key)
 * or 'toml' (Codex's config.toml).
 * @returns {{name: string, path: string, format: string}[]}
 */
function getAllAgentConfigs() {
  const onMac = process.platform === 'darwin';
  // Ternaries stay lazy on purpose: APPDATA is only dereferenced on
  // Windows, where it is expected to be defined.
  const claudeDesktopPath = IS_WIN
    ? path.join(APPDATA, 'Claude', 'claude_desktop_config.json')
    : onMac
      ? path.join(HOME, 'Library', 'Application Support', 'Claude', 'claude_desktop_config.json')
      : path.join(HOME, '.config', 'claude', 'claude_desktop_config.json');
  const vsCodePath = IS_WIN
    ? path.join(APPDATA, 'Code', 'User', 'mcp.json')
    : onMac
      ? path.join(HOME, 'Library', 'Application Support', 'Code', 'User', 'mcp.json')
      : path.join(HOME, '.config', 'Code', 'User', 'mcp.json');
  return [
    { name: 'Claude Code', path: path.join(HOME, '.claude.json'), format: 'mcpServers' },
    { name: 'Claude Desktop', path: claudeDesktopPath, format: 'mcpServers' },
    { name: 'Cursor', path: path.join(HOME, '.cursor', 'mcp.json'), format: 'mcpServers' },
    { name: 'VS Code', path: vsCodePath, format: 'servers' },
    { name: 'Codex', path: path.join(HOME, '.codex', 'config.toml'), format: 'toml' },
    { name: 'Gemini CLI', path: path.join(HOME, '.gemini', 'settings.json'), format: 'mcpServers' },
  ];
}
|
|
118
|
+
|
|
119
|
+
/**
 * Register the Vesper MCP server in a single agent's config file.
 * TOML configs (Codex) get an appended `[mcp_servers.vesper]` table;
 * JSON configs get a `vesper` entry under `mcpServers` (or `servers`
 * with an explicit `type: 'stdio'` for VS Code).
 * @param {{name: string, path: string, format: string}} agent - Descriptor from getAllAgentConfigs().
 * @returns {boolean} true on success or if already configured; false on any failure.
 */
function installMcpToAgent(agent) {
  const npxCmd = IS_WIN ? 'npx.cmd' : 'npx';
  const serverEntry = { command: npxCmd, args: ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp'] };

  try {
    if (agent.format === 'toml') {
      let content = fs.existsSync(agent.path) ? fs.readFileSync(agent.path, 'utf8') : '';
      // Already configured — do not append a duplicate table.
      // NOTE(review): stale `args` from older releases are left as-is here;
      // upgrading them would require rewriting the existing table.
      if (content.includes('[mcp_servers.vesper]')) return true;
      ensureDir(path.dirname(agent.path));
      content += `\n[mcp_servers.vesper]\ncommand = "${serverEntry.command}"\nargs = [${serverEntry.args.map(a => `"${a}"`).join(', ')}]\n`;
      fs.writeFileSync(agent.path, content, 'utf8');
      return true;
    }

    let config = {};
    if (fs.existsSync(agent.path)) {
      // BUGFIX: an unparseable existing config (e.g. JSONC with comments,
      // or a hand-edited file) used to be silently replaced with {} and
      // then rewritten, wiping the user's other MCP server entries.
      // Bail out instead of clobbering.
      try {
        config = JSON.parse(fs.readFileSync(agent.path, 'utf8').trim() || '{}');
      } catch {
        return false;
      }
    } else {
      ensureDir(path.dirname(agent.path));
    }

    const key = agent.format === 'servers' ? 'servers' : 'mcpServers';
    if (!config[key]) config[key] = {};

    // VS Code's `servers` schema requires an explicit transport type.
    const entry = agent.format === 'servers'
      ? { type: 'stdio', ...serverEntry }
      : serverEntry;

    config[key].vesper = entry;
    fs.writeFileSync(agent.path, JSON.stringify(config, null, 2), 'utf8');
    return true;
  } catch {
    // Filesystem errors (permissions, read-only dirs) are reported as failure.
    return false;
  }
}
|
|
154
|
+
|
|
155
|
+
// āā Server Health Check āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
156
|
+
/**
 * Best-effort health probe: launch the published server through npx
 * with `--version` and report whether it responded within 10 seconds.
 * Healthy means a zero exit status, or stderr output containing "Vesper".
 * NOTE(review): spawnSync blocks the event loop for the duration of the
 * probe; the async signature only keeps the call site promise-shaped.
 * @returns {Promise<boolean|string|undefined>} truthy when healthy.
 */
async function checkServerHealth() {
  try {
    const launcher = IS_WIN ? 'npx.cmd' : 'npx';
    const probeArgs = ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp', '--version'];
    const probe = spawnSync(launcher, probeArgs, {
      timeout: 10000,
      encoding: 'utf8',
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    // Expression kept verbatim: callers rely only on truthiness.
    return probe.status === 0 || (probe.stderr && probe.stderr.includes('Vesper'));
  } catch {
    return false;
  }
}
|
|
169
|
+
|
|
170
|
+
// āā Main Wizard āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
171
|
+
/**
 * Interactive setup wizard. Sequentially:
 *   1. creates Vesper's local directory tree,
 *   2. generates (or reuses) a local API key persisted to CONFIG_TOML,
 *   3. records the local-unified auth mode,
 *   4. installs the MCP server via npx,
 *   5. auto-configures every detected coding agent,
 *   6. verifies what exists on disk and prints a summary.
 *
 * Fix vs. previous version: `spawnSync` never throws on launch failure
 * or a non-zero child exit — it reports via `result.error` /
 * `result.status` — so the old catch-based "Could not auto-install"
 * warning was dead code and failures printed as success. Step 4 now
 * inspects the result explicitly.
 */
async function main() {
  printBanner();

  console.log(` ${green('ā')} Setting up Vesper on ${bold(os.hostname())}\n`);

  // Step 1: create the local directory tree (idempotent).
  process.stdout.write(` ${dim('[')}${cyan('1/6')}${dim(']')} Creating local directories...`);
  ensureDir(VESPER_DIR);
  ensureDir(DATA_DIR);
  ensureDir(path.join(DATA_DIR, 'raw'));
  ensureDir(path.join(DATA_DIR, 'processed'));
  ensureDir(path.join(VESPER_DIR, 'datasets'));
  console.log(` ${green('ā')}`);

  // Step 2: generate or reuse the local API key and persist it.
  process.stdout.write(` ${dim('[')}${cyan('2/6')}${dim(']')} Generating local API key...`);
  const existing = readToml(CONFIG_TOML);
  const localKey = existing.api_key || generateLocalKey();
  const configData = { ...existing, api_key: localKey };
  writeToml(CONFIG_TOML, configData);
  console.log(` ${green('ā')}`);
  console.log(` ${dim('Key:')} ${dim(localKey.slice(0, 20) + '...')} ${dim('ā')} ${dim(CONFIG_TOML)}`);

  // Step 3: record the auth mode (defaults to the single-local-key mode).
  process.stdout.write(`\n ${dim('[')}${cyan('3/6')}${dim(']')} Initializing local credentials vault...`);
  configData.auth_mode = configData.auth_mode || 'local_unified';
  writeToml(CONFIG_TOML, configData);
  console.log(` ${green('ā')}`);
  console.log(` ${dim('Mode:')} ${dim('single local Vesper key (no external keys required)')}`);

  // Step 4: install the published MCP server via npx.
  console.log(`\n ${dim('[')}${cyan('4/6')}${dim(']')} Installing Vesper MCP server...`);
  try {
    const npmCmd = IS_WIN ? 'npx.cmd' : 'npx';
    const installResult = spawnSync(npmCmd, ['-y', '-p', '@vespermcp/mcp-server@latest', 'vespermcp', '--setup', '--silent'], {
      stdio: 'inherit',
      timeout: 120000,
    });
    // spawnSync reports launch failures via `.error` and child failures
    // via `.status`; it does not throw, so check the result explicitly.
    if (installResult.error || installResult.status !== 0) {
      console.log(` ${yellow('ā ')} Could not auto-install ā run manually: npx -y -p @vespermcp/mcp-server@latest vespermcp --setup`);
    } else {
      console.log(` ${green('ā')} @vespermcp/mcp-server installed`);
    }
  } catch {
    console.log(` ${yellow('ā ')} Could not auto-install ā run manually: npx -y -p @vespermcp/mcp-server@latest vespermcp --setup`);
  }

  // Step 5: write the MCP entry into every agent whose config dir exists.
  process.stdout.write(`\n ${dim('[')}${cyan('5/6')}${dim(']')} Configuring coding agents...`);
  const agents = getAllAgentConfigs();
  const configuredAgents = [];
  // NOTE(review): skippedAgents is collected but never surfaced to the user.
  const skippedAgents = [];

  for (const agent of agents) {
    const dirExists = fs.existsSync(path.dirname(agent.path));
    const fileExists = fs.existsSync(agent.path);
    if (fileExists || dirExists) {
      const ok = installMcpToAgent(agent);
      if (ok) configuredAgents.push(agent.name);
      else skippedAgents.push(agent.name);
    }
  }
  console.log(` ${green('ā')}`);

  if (configuredAgents.length > 0) {
    console.log(`\n āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā`);
    console.log(` ā ${bold('MCP Auto-Configured')} ā`);
    console.log(` āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā¤`);
    for (const name of configuredAgents) {
      console.log(` ā ${green('ā')} ${name.padEnd(42)}ā`);
    }
    console.log(` āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā`);
  }

  // Step 6: report what exists on disk; missing artifacts are built lazily.
  console.log(`\n ${dim('[')}${cyan('6/6')}${dim(']')} Verifying installation...`);

  const dbExists = fs.existsSync(path.join(DATA_DIR, 'metadata.db'));
  const vecExists = fs.existsSync(path.join(DATA_DIR, 'vectors.json')) || fs.existsSync(path.join(DATA_DIR, 'vectors.bin'));
  const keyStored = fs.existsSync(CONFIG_TOML);

  console.log(` ${keyStored ? green('ā') : red('ā')} Local API key ${dim(CONFIG_TOML)}`);
  console.log(` ${dbExists ? green('ā') : yellow('ā ')} Dataset index ${dim(dbExists ? 'ready' : 'will build on first search')}`);
  console.log(` ${vecExists ? green('ā') : yellow('ā ')} Vector store ${dim(vecExists ? 'ready' : 'will build on first search')}`);
  console.log(` ${configuredAgents.length > 0 ? green('ā') : yellow('ā ')} MCP agents ${dim(configuredAgents.length + ' configured')}`);

  // Final summary for the user (purely informational).
  console.log(`
 ${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}

 ${green(bold('ā Vesper is ready!'))}

 ${bold('Your local API key:')}
 ${cyan(localKey)}

 ${bold('Config file:')}
 ${dim(CONFIG_TOML)}

 ${bold('What just happened:')}
 ${dim('1.')} Generated a local API key (never leaves your machine)
 ${dim('2.')} Initialized local credentials vault
 ${dim('3.')} Auto-configured MCP for ${configuredAgents.length > 0 ? configuredAgents.join(', ') : 'detected agents'}
 ${dim('4.')} Vesper server ready on stdio transport

 ${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}

 ${bold('Quick start ā try in your AI assistant:')}

 ${cyan('Search datasets')}
 ${dim('>')} vesper_search(query="sentiment analysis")

 ${cyan('Download & prepare')}
 ${dim('>')} prepare_dataset(query="image classification cats dogs")

 ${cyan('Quality analysis')}
 ${dim('>')} analyze_quality(dataset_id="imdb")

 ${cyan('Export to your project')}
 ${dim('>')} export_dataset(dataset_id="imdb", format="parquet")

 ${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}

 ${bold('Unified API ā one interface, every source:')}
 HuggingFace Ā· Kaggle Ā· OpenML Ā· data.world

 ${dim('Agents call localhost Vesper APIs with one local key.')}
 ${dim('Vesper adapters handle provider routing internally.')}

 ${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}

 ${yellow('ā')} Restart your IDE to activate MCP
 ${dim('Docs:')} https://github.com/vesper/mcp-server

 ${dim('āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā')}
 `);
}
|
|
303
|
+
|
|
304
|
+
// Entry point: run the wizard and exit non-zero on any unhandled
// failure so shell callers (and CI) can detect the error.
main().catch((err) => {
  const detail = err.message || err;
  console.error(`\n${red('Error:')} ${detail}`);
  process.exit(1);
});
|
|
@@ -31,6 +31,19 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
|
|
|
31
31
|
ext = os.path.splitext(file_path)[1].lower()
|
|
32
32
|
if ext == ".csv":
|
|
33
33
|
df = pl.read_csv(file_path, ignore_errors=True)
|
|
34
|
+
elif ext == ".tsv":
|
|
35
|
+
df = pl.read_csv(file_path, separator="\t", ignore_errors=True)
|
|
36
|
+
elif ext == ".txt":
|
|
37
|
+
# Heuristic delimiter detection for plain text tabular files.
|
|
38
|
+
sep = ","
|
|
39
|
+
try:
|
|
40
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
41
|
+
first_line = fh.readline()
|
|
42
|
+
if "\t" in first_line:
|
|
43
|
+
sep = "\t"
|
|
44
|
+
except Exception:
|
|
45
|
+
sep = ","
|
|
46
|
+
df = pl.read_csv(file_path, separator=sep, ignore_errors=True)
|
|
34
47
|
elif ext in (".parquet", ".pq"):
|
|
35
48
|
df = pl.read_parquet(file_path)
|
|
36
49
|
elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
|
|
@@ -40,6 +53,9 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
|
|
|
40
53
|
else:
|
|
41
54
|
raise ValueError(f"Unsupported input format: {ext}")
|
|
42
55
|
|
|
56
|
+
if len(df) == 0:
|
|
57
|
+
raise ValueError("empty CSV")
|
|
58
|
+
|
|
43
59
|
# Column selection (before sampling for speed)
|
|
44
60
|
if columns:
|
|
45
61
|
valid = [c for c in columns if c in df.columns]
|
|
@@ -102,6 +102,18 @@ def main():
|
|
|
102
102
|
file_path_lower = file_path.lower()
|
|
103
103
|
if file_path_lower.endswith(".csv"):
|
|
104
104
|
df = pl.read_csv(file_path, ignore_errors=True, n_rows=10000)
|
|
105
|
+
elif file_path_lower.endswith(".tsv"):
|
|
106
|
+
df = pl.read_csv(file_path, separator="\t", ignore_errors=True, n_rows=10000)
|
|
107
|
+
elif file_path_lower.endswith(".txt"):
|
|
108
|
+
sep = ","
|
|
109
|
+
try:
|
|
110
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
111
|
+
first_line = fh.readline()
|
|
112
|
+
if "\t" in first_line:
|
|
113
|
+
sep = "\t"
|
|
114
|
+
except Exception:
|
|
115
|
+
sep = ","
|
|
116
|
+
df = pl.read_csv(file_path, separator=sep, ignore_errors=True, n_rows=10000)
|
|
105
117
|
elif file_path_lower.endswith(".parquet"):
|
|
106
118
|
try:
|
|
107
119
|
# Try scanning first (faster for large files)
|
|
@@ -133,10 +145,18 @@ def main():
|
|
|
133
145
|
column_count = len(df.columns)
|
|
134
146
|
|
|
135
147
|
# Duplicate detection (exact)
|
|
148
|
+
# NOTE: Some Polars versions can panic on is_duplicated() for nested/null rows.
|
|
149
|
+
# Use a Python fallback that is slower but robust for the 10k sampled rows.
|
|
150
|
+
duplicate_count = 0
|
|
136
151
|
try:
|
|
137
|
-
|
|
152
|
+
seen = set()
|
|
153
|
+
for row in df.to_dicts():
|
|
154
|
+
row_key = json.dumps(row, sort_keys=True, default=str)
|
|
155
|
+
if row_key in seen:
|
|
156
|
+
duplicate_count += 1
|
|
157
|
+
else:
|
|
158
|
+
seen.add(row_key)
|
|
138
159
|
except Exception:
|
|
139
|
-
# Duplicate check might fail on complex nested types (List, Struct)
|
|
140
160
|
duplicate_count = 0
|
|
141
161
|
|
|
142
162
|
columns_stats = []
|
|
@@ -165,12 +185,16 @@ def main():
|
|
|
165
185
|
if duplicate_count == 0 and len(text_cols) > 0:
|
|
166
186
|
# Pick longest text column as likely "content"
|
|
167
187
|
# In real impl, we'd use heuristics. For now, first text col.
|
|
168
|
-
target_col = text_cols[0]
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
188
|
+
target_col = text_cols[0]
|
|
189
|
+
try:
|
|
190
|
+
text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
|
|
191
|
+
if text_dupes > 0:
|
|
192
|
+
report["text_duplicates"] = int(text_dupes)
|
|
193
|
+
if text_dupes > (row_count * 0.2):
|
|
194
|
+
report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")
|
|
195
|
+
except Exception:
|
|
196
|
+
# Skip text duplicate warning if backend cannot compute duplicates for this dtype
|
|
197
|
+
pass
|
|
174
198
|
|
|
175
199
|
# Integrity Check 2: Contamination / Leakage (Basic)
|
|
176
200
|
# (Skipping correlation for now)
|
package/src/scripts/wizard.js
DELETED
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
// Vesper Wizard CLI: Interactive setup for fast configuration
|
|
4
|
-
const inquirer = require('inquirer');
|
|
5
|
-
const fs = require('fs');
|
|
6
|
-
const path = require('path');
|
|
7
|
-
|
|
8
|
-
async function main() {
|
|
9
|
-
console.log('\nš§ Welcome to the Vesper Wizard!\n');
|
|
10
|
-
|
|
11
|
-
// Step 1: Project basics
|
|
12
|
-
const { projectName } = await inquirer.prompt([
|
|
13
|
-
{
|
|
14
|
-
type: 'input',
|
|
15
|
-
name: 'projectName',
|
|
16
|
-
message: 'Project name:',
|
|
17
|
-
default: path.basename(process.cwd()),
|
|
18
|
-
},
|
|
19
|
-
]);
|
|
20
|
-
|
|
21
|
-
// Step 2: Data directory
|
|
22
|
-
const { dataDir } = await inquirer.prompt([
|
|
23
|
-
{
|
|
24
|
-
type: 'input',
|
|
25
|
-
name: 'dataDir',
|
|
26
|
-
message: 'Path to your data directory:',
|
|
27
|
-
default: './datasets',
|
|
28
|
-
},
|
|
29
|
-
]);
|
|
30
|
-
|
|
31
|
-
// Step 3: Default export format
|
|
32
|
-
const { exportFormat } = await inquirer.prompt([
|
|
33
|
-
{
|
|
34
|
-
type: 'list',
|
|
35
|
-
name: 'exportFormat',
|
|
36
|
-
message: 'Default export format:',
|
|
37
|
-
choices: ['parquet', 'csv', 'feather'],
|
|
38
|
-
default: 'parquet',
|
|
39
|
-
},
|
|
40
|
-
]);
|
|
41
|
-
|
|
42
|
-
// Step 4: Add tokens/credentials
|
|
43
|
-
const { addTokens } = await inquirer.prompt([
|
|
44
|
-
{
|
|
45
|
-
type: 'confirm',
|
|
46
|
-
name: 'addTokens',
|
|
47
|
-
message: 'Would you like to add API tokens or credentials now?',
|
|
48
|
-
default: true,
|
|
49
|
-
},
|
|
50
|
-
]);
|
|
51
|
-
let tokens = {};
|
|
52
|
-
if (addTokens) {
|
|
53
|
-
const { kaggleToken } = await inquirer.prompt([
|
|
54
|
-
{
|
|
55
|
-
type: 'input',
|
|
56
|
-
name: 'kaggleToken',
|
|
57
|
-
message: 'Kaggle API token (leave blank to skip):',
|
|
58
|
-
},
|
|
59
|
-
]);
|
|
60
|
-
if (kaggleToken) tokens.kaggle = kaggleToken;
|
|
61
|
-
// Add more tokens as needed
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
// Step 5: Write config file
|
|
65
|
-
const config = {
|
|
66
|
-
project: projectName,
|
|
67
|
-
dataDir,
|
|
68
|
-
exportFormat,
|
|
69
|
-
tokens,
|
|
70
|
-
};
|
|
71
|
-
const configPath = path.join(process.cwd(), 'vesper-mcp-config.json');
|
|
72
|
-
fs.writeFileSync(configPath, JSON.stringify(config, null, 2));
|
|
73
|
-
console.log(`\nā
Configuration saved to ${configPath}`);
|
|
74
|
-
console.log('\nš Vesper is ready to use!\n');
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
main();
|