@vespermcp/mcp-server 1.2.13 → 1.2.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/build/index.js CHANGED
@@ -43,11 +43,12 @@ function upsertRegistry(dataset_id, local_path, status) {
43
43
  function getRegistryEntry(dataset_id) {
44
44
  const norm_id = normalize_dataset_id(dataset_id);
45
45
  console.error(`[Registry] Lookup key: ${norm_id}`);
46
- return readRegistry().find(e => e.dataset_id === norm_id);
46
+ return readRegistry().find(e => (e.dataset_id || e.id) === norm_id);
47
47
  }
48
48
  // --- Pipeline State Tracker ---
49
49
  // Tracks completed steps per session/job/dataset
50
50
  const pipelineState = {};
51
+ const jobStatusLastPoll = {};
51
52
  function getPipelineKey(datasetId) {
52
53
  return datasetId;
53
54
  }
@@ -77,6 +78,7 @@ import { fileURLToPath } from "url";
77
78
  import path from "path";
78
79
  import fs from "fs";
79
80
  import { spawn } from "child_process";
81
+ import { spawnSync } from "child_process";
80
82
  import { MetadataStore } from "./metadata/store.js";
81
83
  import { VectorStore } from "./search/vector-store.js";
82
84
  import { Embedder } from "./search/embedder.js";
@@ -348,7 +350,7 @@ function syncPythonScripts(appRoot, dataRoot) {
348
350
  let shouldCopy = true;
349
351
  if (fs.existsSync(destPath)) {
350
352
  const destStat = fs.statSync(destPath);
351
- if (srcStat.size === destStat.size)
353
+ if (srcStat.size === destStat.size && srcStat.mtimeMs <= destStat.mtimeMs)
352
354
  shouldCopy = false;
353
355
  }
354
356
  if (shouldCopy) {
@@ -450,17 +452,55 @@ jobManager.on("processJob", async (job, execute) => {
450
452
  async function handlePrepareJob(jobId, query, requirements) {
451
453
  const update = (updates) => jobManager.updateJob(jobId, updates);
452
454
  const requestedRows = extractRequestedRows(query, requirements);
453
- update({ progress: 10, status_text: "Searching for best dataset matching query..." });
454
- const results = await searchEngine.search(query, { limit: 1 });
455
- if (results.length === 0) {
456
- throw new Error("No datasets found matching the query. Try refining your search terms.");
455
+ let selectedDataset;
456
+ let datasetIdForDownload = "";
457
+ let source;
458
+ const parsedQuery = parseDatasetId(query);
459
+ const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
460
+ if (isExplicitDatasetRef) {
461
+ let explicitId = parsedQuery;
462
+ if (/^hf:/i.test(explicitId)) {
463
+ explicitId = explicitId.replace(/^hf:/i, "huggingface:");
464
+ }
465
+ if (/^kaggle:/i.test(explicitId)) {
466
+ source = "kaggle";
467
+ datasetIdForDownload = explicitId.replace(/^kaggle:/i, "");
468
+ }
469
+ else if (/^huggingface:/i.test(explicitId)) {
470
+ source = "huggingface";
471
+ datasetIdForDownload = explicitId.replace(/^huggingface:/i, "");
472
+ }
473
+ else if (/^openml:/i.test(explicitId)) {
474
+ source = "openml";
475
+ datasetIdForDownload = explicitId.replace(/^openml:/i, "");
476
+ }
477
+ else if (/^dataworld:/i.test(explicitId)) {
478
+ source = "dataworld";
479
+ datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
480
+ }
481
+ else {
482
+ source = "kaggle";
483
+ datasetIdForDownload = explicitId;
484
+ }
485
+ update({
486
+ progress: 20,
487
+ status_text: `Using explicit dataset id: ${datasetIdForDownload} (${source})`
488
+ });
489
+ }
490
+ else {
491
+ update({ progress: 10, status_text: "Searching for best dataset matching query..." });
492
+ const results = await searchEngine.search(query, { limit: 1 });
493
+ if (results.length === 0) {
494
+ throw new Error("No datasets found matching the query. Try refining your search terms.");
495
+ }
496
+ selectedDataset = results[0];
497
+ datasetIdForDownload = selectedDataset.id;
498
+ source = selectedDataset.source;
499
+ update({
500
+ progress: 20,
501
+ status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
502
+ });
457
503
  }
458
- const topDataset = results[0];
459
- update({
460
- progress: 20,
461
- status_text: `Matched: ${topDataset.name} (${topDataset.source})`
462
- });
463
- const source = topDataset.source;
464
504
  // Pre-check credentials for Kaggle
465
505
  if (source === "kaggle") {
466
506
  if (!process.env.KAGGLE_USERNAME || !process.env.KAGGLE_KEY ||
@@ -470,10 +510,10 @@ async function handlePrepareJob(jobId, query, requirements) {
470
510
  }
471
511
  update({ progress: 30, status_text: `Starting download from ${source}...` });
472
512
  // ensureData handles download and returns path to the raw file
473
- let rawFilePath = await dataIngestor.ensureData(topDataset.id, source, (msg, prog) => {
513
+ let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
474
514
  update({ status_text: msg, progress: 30 + (prog ? Math.floor(prog * 0.4) : 0) });
475
515
  });
476
- if (requestedRows && requestedRows > 0) {
516
+ if (requestedRows && requestedRows > 0 && !isExplicitDatasetRef) {
477
517
  update({ progress: 62, status_text: `Validating requested sample count (${requestedRows.toLocaleString()})...` });
478
518
  let currentRows = await countRows(rawFilePath);
479
519
  if (currentRows < requestedRows) {
@@ -482,7 +522,7 @@ async function handlePrepareJob(jobId, query, requirements) {
482
522
  const sourceFiles = [rawFilePath];
483
523
  let totalRows = currentRows;
484
524
  for (const ds of additional) {
485
- if (ds.id === topDataset.id)
525
+ if (ds.id === datasetIdForDownload)
486
526
  continue;
487
527
  try {
488
528
  const dsSource = ds.source;
@@ -516,10 +556,10 @@ async function handlePrepareJob(jobId, query, requirements) {
516
556
  rawFilePath = fusionResult.output_path;
517
557
  try {
518
558
  // Register fused output for this top dataset so export can find it
519
- upsertRegistry(topDataset.id, rawFilePath, "completed");
559
+ upsertRegistry(datasetIdForDownload, rawFilePath, "completed");
520
560
  }
521
561
  catch (e) {
522
- console.error(`[Registry] Failed to write registry for fused output ${topDataset.id}: ${e?.message || e}`);
562
+ console.error(`[Registry] Failed to write registry for fused output ${datasetIdForDownload}: ${e?.message || e}`);
523
563
  }
524
564
  currentRows = await countRows(rawFilePath);
525
565
  }
@@ -530,22 +570,31 @@ async function handlePrepareJob(jobId, query, requirements) {
530
570
  update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
531
571
  }
532
572
  }
573
+ let qualityScore = selectedDataset?.quality_score ?? 70;
533
574
  update({ progress: 70, status_text: "Analyzing dataset quality..." });
534
- const report = await qualityAnalyzer.analyze(rawFilePath);
535
- // Update local metadata with quality info
536
- metadataStore.saveDataset({
537
- ...topDataset,
538
- quality_score: report.overall_score
539
- });
575
+ try {
576
+ const report = await qualityAnalyzer.analyze(rawFilePath);
577
+ qualityScore = report.overall_score;
578
+ }
579
+ catch (error) {
580
+ console.error(`[Prepare] Quality analysis failed for ${datasetIdForDownload}: ${error?.message || error}`);
581
+ update({ progress: 78, status_text: "Quality analysis skipped (unsupported schema). Continuing installation..." });
582
+ }
583
+ if (selectedDataset) {
584
+ metadataStore.saveDataset({
585
+ ...selectedDataset,
586
+ quality_score: qualityScore
587
+ });
588
+ }
540
589
  update({ progress: 85, status_text: "Installing dataset into project..." });
541
- const installPath = await installService.install(topDataset.id, rawFilePath);
590
+ const installPath = await installService.install(datasetIdForDownload, rawFilePath);
542
591
  update({ progress: 100, status_text: "Preparation complete!" });
543
592
  // Register prepared dataset in local registry for lookup by export/list tools
544
593
  try {
545
- upsertRegistry(topDataset.id, installPath, "completed");
594
+ upsertRegistry(datasetIdForDownload, installPath, "completed");
546
595
  }
547
596
  catch (e) {
548
- console.error(`[Registry] Failed to write registry for ${topDataset.id}: ${e?.message || e}`);
597
+ console.error(`[Registry] Failed to write registry for ${datasetIdForDownload}: ${e?.message || e}`);
549
598
  }
550
599
  return installPath;
551
600
  }
@@ -1443,6 +1492,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1443
1492
  if (!job) {
1444
1493
  throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
1445
1494
  }
1495
+ const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
1496
+ const now = Date.now();
1497
+ const last = jobStatusLastPoll[jobId] || 0;
1498
+ const minPollMs = 3000;
1499
+ if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
1500
+ const waitMs = minPollMs - (now - last);
1501
+ return {
1502
+ content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
1503
+ };
1504
+ }
1505
+ jobStatusLastPoll[jobId] = now;
1446
1506
  return {
1447
1507
  content: [{ type: "text", text: formatJobStatus(job) }]
1448
1508
  };
@@ -1482,9 +1542,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1482
1542
  catch (e) {
1483
1543
  console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
1484
1544
  }
1485
- // Poll for download status until local_path appears or timeout
1545
+ // Poll for download status or registry entry until local_path appears or timeout
1486
1546
  const wait = (ms) => new Promise(res => setTimeout(res, ms));
1487
- const maxWait = 60_000; // 60s
1547
+ const maxWait = 120_000; // 120s
1488
1548
  const interval = 2000;
1489
1549
  let waited = 0;
1490
1550
  while (waited < maxWait) {
@@ -1494,13 +1554,20 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1494
1554
  console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
1495
1555
  break;
1496
1556
  }
1557
+ const reg = getRegistryEntry(datasetId);
1558
+ const regPath = reg?.local_path || reg?.path;
1559
+ if (regPath && fs.existsSync(regPath)) {
1560
+ sourcePath = regPath;
1561
+ console.error(`[Export] Local data found in registry for ${datasetId}: ${sourcePath}`);
1562
+ break;
1563
+ }
1497
1564
  await wait(interval);
1498
1565
  waited += interval;
1499
1566
  }
1500
1567
  // If still no sourcePath, return helpful error listing prepared datasets
1501
1568
  if (!sourcePath) {
1502
1569
  const entries = readRegistry();
1503
- const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id}: ${e.local_path}`).join("\n");
1570
+ const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
1504
1571
  return {
1505
1572
  content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
1506
1573
  isError: true
@@ -1511,7 +1578,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1511
1578
  if (!fastMode) {
1512
1579
  const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
1513
1580
  const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
1514
- if (currentExt !== pipelineFmt) {
1581
+ const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
1582
+ if (!pipelineCompatibleInput) {
1583
+ console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
1584
+ }
1585
+ else if (currentExt !== pipelineFmt) {
1515
1586
  console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
1516
1587
  try {
1517
1588
  const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
@@ -1841,8 +1912,8 @@ async function main() {
1841
1912
  await runDownloadCli(args);
1842
1913
  return;
1843
1914
  }
1844
- // If run in setup mode OR in a terminal without args (human call), show setup wizard
1845
- if (isSetup || (process.stdin.isTTY && args.length === 0)) {
1915
+ // If run in explicit setup mode, show setup wizard (do not auto-run on server startup)
1916
+ if (isSetup) {
1846
1917
  await runSetupWizard(isSilent);
1847
1918
  return;
1848
1919
  }
@@ -1854,7 +1925,15 @@ async function main() {
1854
1925
  await server.connect(transport);
1855
1926
  console.error("Vesper MCP server running on stdio");
1856
1927
  console.error("Tip: To configure Vesper for your IDE, run: npx @vespermcp/mcp-server --setup");
1857
- console.log("[Vesper] Main loop finished");
1928
+ await new Promise((resolve) => {
1929
+ const done = () => resolve();
1930
+ process.stdin.resume();
1931
+ process.stdin.once("end", done);
1932
+ process.stdin.once("close", done);
1933
+ process.once("SIGINT", done);
1934
+ process.once("SIGTERM", done);
1935
+ });
1936
+ console.error("[Vesper] Main loop finished");
1858
1937
  }
1859
1938
  async function runConfigCli(args) {
1860
1939
  const isKeys = args.includes("keys");
@@ -2161,23 +2240,41 @@ async function runFuseCli(args) {
2161
2240
  console.log("Next: run vespermcp split/export on the fused dataset");
2162
2241
  }
2163
2242
  async function runSetupWizard(silent = false) {
2243
+ if (!silent && process.stdin.isTTY) {
2244
+ const wizardCandidates = [
2245
+ path.join(appRoot, "scripts", "wizard.js"),
2246
+ path.join(appRoot, "src", "scripts", "wizard.js"),
2247
+ path.join(process.cwd(), "vesper-wizard", "wizard.js"),
2248
+ ];
2249
+ const wizardScript = wizardCandidates.find(candidate => fs.existsSync(candidate));
2250
+ if (wizardScript) {
2251
+ console.error("[Vesper Setup] Running guided wizard...");
2252
+ const result = spawnSync(process.execPath, [wizardScript], {
2253
+ stdio: "inherit",
2254
+ env: process.env,
2255
+ });
2256
+ if ((result.status ?? 1) !== 0) {
2257
+ console.error("[Vesper Setup] Wizard exited with non-zero status, continuing with automatic MCP config only.");
2258
+ }
2259
+ }
2260
+ }
2164
2261
  const configManager = new ConfigManager();
2165
2262
  if (!silent) {
2166
- console.log(`\nVesper MCP - Universal Setup`);
2167
- console.log(`================================`);
2168
- console.log(`Installing to all detected coding agents...\n`);
2263
+ console.error(`\nVesper MCP - Universal Setup`);
2264
+ console.error(`================================`);
2265
+ console.error(`Installing to all detected coding agents...\n`);
2169
2266
  }
2170
2267
  const result = await runWithSpinner("Installing to detected coding agents", () => configManager.installToAll());
2171
2268
  if (result.success.length === 0 && result.failed.length === 0) {
2172
2269
  if (!silent) {
2173
- console.log("\nNo supported agents detected.");
2174
- console.log("Supported agents: Claude Code, Claude Desktop, Cursor, VS Code, Codex, Antigravity");
2175
- console.log("\nMake sure at least one is installed, then try again.");
2270
+ console.error("\nNo supported agents detected.");
2271
+ console.error("Supported agents: Claude Code, Claude Desktop, Cursor, VS Code, Codex, Antigravity");
2272
+ console.error("\nMake sure at least one is installed, then try again.");
2176
2273
  }
2177
2274
  return;
2178
2275
  }
2179
2276
  if (!silent) {
2180
- console.log("Setup complete! Please RESTART your IDE(s) to apply changes.");
2277
+ console.error("Setup complete! Please RESTART your IDE(s) to apply changes.");
2181
2278
  }
2182
2279
  }
2183
2280
  main().catch((error) => {
@@ -1,4 +1,5 @@
1
1
  import { listFiles } from "@huggingface/hub";
2
+ import fs from "fs";
2
3
  import path from "path";
3
4
  import { RobustDownloader } from "../utils/downloader.js";
4
5
  export class HFDownloader {
@@ -19,6 +20,7 @@ export class HFDownloader {
19
20
  try {
20
21
  const token = this.getToken();
21
22
  const files = [];
23
+ const metadataFiles = [];
22
24
  const blacklist = [
23
25
  ".gitattributes",
24
26
  ".gitignore",
@@ -29,6 +31,15 @@ export class HFDownloader {
29
31
  "requirements.txt",
30
32
  "setup.py"
31
33
  ];
34
+ const metadataNamePatterns = [
35
+ /^dataset_infos?\.json$/i,
36
+ /^dataset_dict\.json$/i,
37
+ /^state\.json$/i,
38
+ /^config\.json$/i,
39
+ /^metadata\.json$/i,
40
+ /^stats\.json$/i,
41
+ /^index\.json$/i
42
+ ];
32
43
  for await (const file of listFiles({
33
44
  repo: { type: "dataset", name: repoId },
34
45
  recursive: true,
@@ -36,7 +47,11 @@ export class HFDownloader {
36
47
  })) {
37
48
  if (file.type === "file") {
38
49
  const fileName = path.basename(file.path);
39
- if (!blacklist.includes(fileName) && !fileName.startsWith(".")) {
50
+ const isMetadataJson = metadataNamePatterns.some(p => p.test(fileName));
51
+ if (isMetadataJson) {
52
+ metadataFiles.push(file.path);
53
+ }
54
+ if (!blacklist.includes(fileName) && !fileName.startsWith(".") && !isMetadataJson) {
40
55
  files.push(file.path);
41
56
  }
42
57
  }
@@ -49,7 +64,15 @@ export class HFDownloader {
49
64
  /train.*\.csv$/i,
50
65
  /data.*\.csv$/i,
51
66
  /.*\.csv$/i,
67
+ /train.*\.tsv$/i,
68
+ /data.*\.tsv$/i,
69
+ /.*\.tsv$/i,
70
+ /train.*\.txt$/i,
71
+ /data.*\.txt$/i,
72
+ /.*\.txt$/i,
52
73
  /.*\.jsonl$/i,
74
+ /.*\.ndjson$/i,
75
+ // Keep plain JSON as lowest priority to avoid selecting metadata-like files.
53
76
  /.*\.json$/i
54
77
  ];
55
78
  for (const pattern of priorities) {
@@ -58,12 +81,16 @@ export class HFDownloader {
58
81
  return match;
59
82
  }
60
83
  // Strict fallback: Only return the first file if it has a data-like extension
61
- const dataExtensions = [".csv", ".parquet", ".jsonl", ".json", ".txt", ".tsv", ".avro", ".orc"];
84
+ const dataExtensions = [".csv", ".parquet", ".jsonl", ".ndjson", ".tsv", ".txt", ".json", ".avro", ".orc"];
62
85
  const fallback = files.find(f => {
63
86
  const ext = path.extname(f).toLowerCase();
64
87
  return dataExtensions.includes(ext);
65
88
  });
66
- return fallback || null;
89
+ if (fallback)
90
+ return fallback;
91
+ // Last-resort: allow dataset metadata file, then resolve external raw URLs later.
92
+ const metadataFallback = metadataFiles.find(f => /dataset_infos?\.json$/i.test(path.basename(f)));
93
+ return metadataFallback || null;
67
94
  }
68
95
  catch (error) {
69
96
  const msg = String(error?.message || error);
@@ -90,4 +117,45 @@ export class HFDownloader {
90
117
  }
91
118
  });
92
119
  }
120
+ /**
121
+ * If downloaded file is dataset metadata (dataset_infos.json), resolve and download a real data URL.
122
+ * Returns the actual local data path to use.
123
+ */
124
+ async resolveExternalDataFromMetadata(localPath, onProgress) {
125
+ const ext = path.extname(localPath).toLowerCase();
126
+ if (ext !== ".json") {
127
+ return localPath;
128
+ }
129
+ try {
130
+ const raw = fs.readFileSync(localPath, "utf-8");
131
+ const parsed = JSON.parse(raw);
132
+ const firstConfig = parsed?.default || Object.values(parsed || {})[0];
133
+ const checksums = firstConfig?.download_checksums;
134
+ if (!checksums || typeof checksums !== "object") {
135
+ return localPath;
136
+ }
137
+ const candidateUrls = Object.keys(checksums).filter((u) => /^https?:\/\//i.test(u));
138
+ if (candidateUrls.length === 0) {
139
+ return localPath;
140
+ }
141
+ const preferred = candidateUrls.find(u => /train|data/i.test(path.basename(u))) || candidateUrls[0];
142
+ const ext = path.extname(preferred).toLowerCase() || ".csv";
143
+ const resolvedPath = localPath.replace(/\.json$/i, ext);
144
+ await this.downloader.download(preferred, resolvedPath, {
145
+ resume: true,
146
+ onProgress: (bytes, total) => {
147
+ if (total > 0 && onProgress) {
148
+ onProgress(Math.round((bytes / total) * 100));
149
+ }
150
+ }
151
+ });
152
+ if (fs.existsSync(resolvedPath) && fs.statSync(resolvedPath).size > 0) {
153
+ return resolvedPath;
154
+ }
155
+ return localPath;
156
+ }
157
+ catch {
158
+ return localPath;
159
+ }
160
+ }
93
161
  }
@@ -72,9 +72,12 @@ export class DataIngestor {
72
72
  await this.hfDownloader.download(datasetId, remotePath, targetPath, (progress) => {
73
73
  onProgress?.(`Downloading ${path.basename(remotePath)}...`, progress);
74
74
  });
75
- const stats = fs.statSync(targetPath);
76
- this.completeDownload(datasetId, targetPath, stats.size);
77
- return targetPath;
75
+ const resolvedPath = await this.hfDownloader.resolveExternalDataFromMetadata(targetPath, (progress) => {
76
+ onProgress?.("Resolving external dataset file...", progress);
77
+ });
78
+ const stats = fs.statSync(resolvedPath);
79
+ this.completeDownload(datasetId, resolvedPath, stats.size);
80
+ return resolvedPath;
78
81
  }
79
82
  catch (e) {
80
83
  this.failDownload(datasetId, e.message);
@@ -31,6 +31,19 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
31
31
  ext = os.path.splitext(file_path)[1].lower()
32
32
  if ext == ".csv":
33
33
  df = pl.read_csv(file_path, ignore_errors=True)
34
+ elif ext == ".tsv":
35
+ df = pl.read_csv(file_path, separator="\t", ignore_errors=True)
36
+ elif ext == ".txt":
37
+ # Heuristic delimiter detection for plain text tabular files.
38
+ sep = ","
39
+ try:
40
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
41
+ first_line = fh.readline()
42
+ if "\t" in first_line:
43
+ sep = "\t"
44
+ except Exception:
45
+ sep = ","
46
+ df = pl.read_csv(file_path, separator=sep, ignore_errors=True)
34
47
  elif ext in (".parquet", ".pq"):
35
48
  df = pl.read_parquet(file_path)
36
49
  elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
@@ -40,6 +53,9 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
40
53
  else:
41
54
  raise ValueError(f"Unsupported input format: {ext}")
42
55
 
56
+ if len(df) == 0:
57
+ raise ValueError("empty CSV")
58
+
43
59
  # Column selection (before sampling for speed)
44
60
  if columns:
45
61
  valid = [c for c in columns if c in df.columns]
@@ -102,6 +102,18 @@ def main():
102
102
  file_path_lower = file_path.lower()
103
103
  if file_path_lower.endswith(".csv"):
104
104
  df = pl.read_csv(file_path, ignore_errors=True, n_rows=10000)
105
+ elif file_path_lower.endswith(".tsv"):
106
+ df = pl.read_csv(file_path, separator="\t", ignore_errors=True, n_rows=10000)
107
+ elif file_path_lower.endswith(".txt"):
108
+ sep = ","
109
+ try:
110
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
111
+ first_line = fh.readline()
112
+ if "\t" in first_line:
113
+ sep = "\t"
114
+ except Exception:
115
+ sep = ","
116
+ df = pl.read_csv(file_path, separator=sep, ignore_errors=True, n_rows=10000)
105
117
  elif file_path_lower.endswith(".parquet"):
106
118
  try:
107
119
  # Try scanning first (faster for large files)
@@ -133,10 +145,18 @@ def main():
133
145
  column_count = len(df.columns)
134
146
 
135
147
  # Duplicate detection (exact)
148
+ # NOTE: Some Polars versions can panic on is_duplicated() for nested/null rows.
149
+ # Use a Python fallback that is slower but robust for the 10k sampled rows.
150
+ duplicate_count = 0
136
151
  try:
137
- duplicate_count = df.is_duplicated().sum()
152
+ seen = set()
153
+ for row in df.to_dicts():
154
+ row_key = json.dumps(row, sort_keys=True, default=str)
155
+ if row_key in seen:
156
+ duplicate_count += 1
157
+ else:
158
+ seen.add(row_key)
138
159
  except Exception:
139
- # Duplicate check might fail on complex nested types (List, Struct)
140
160
  duplicate_count = 0
141
161
 
142
162
  columns_stats = []
@@ -165,12 +185,16 @@ def main():
165
185
  if duplicate_count == 0 and len(text_cols) > 0:
166
186
  # Pick longest text column as likely "content"
167
187
  # In real impl, we'd use heuristics. For now, first text col.
168
- target_col = text_cols[0]
169
- text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
170
- if text_dupes > 0:
171
- report["text_duplicates"] = int(text_dupes)
172
- if text_dupes > (row_count * 0.2):
173
- report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")
188
+ target_col = text_cols[0]
189
+ try:
190
+ text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
191
+ if text_dupes > 0:
192
+ report["text_duplicates"] = int(text_dupes)
193
+ if text_dupes > (row_count * 0.2):
194
+ report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")
195
+ except Exception:
196
+ # Skip text duplicate warning if backend cannot compute duplicates for this dtype
197
+ pass
174
198
 
175
199
  # Integrity Check 2: Contamination / Leakage (Basic)
176
200
  # (Skipping correlation for now)
@@ -20,6 +20,12 @@ export function formatJobStatus(job) {
20
20
  output += `Status: ${statusText}\n`;
21
21
  output += `Progress: ${bar} ${job.progress}%\n`;
22
22
  output += `Activity: ${job.status_text}\n`;
23
+ if (job.status === "running" || job.status === "retrying" || job.status === "queued" || job.status === "pending") {
24
+ output += `Polling hint: check again in 5-10 seconds.\n`;
25
+ }
26
+ else {
27
+ output += `Polling hint: no further polling required.\n`;
28
+ }
23
29
  if (job.result_url) {
24
30
  output += `\nResult: ${job.result_url}\n`;
25
31
  }
package/package.json CHANGED
@@ -1,13 +1,13 @@
1
1
  {
2
2
  "name": "@vespermcp/mcp-server",
3
- "version": "1.2.13",
3
+ "version": "1.2.15",
4
4
  "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
5
  "type": "module",
6
6
  "main": "build/index.js",
7
7
  "bin": {
8
8
  "vespermcp": "./build/index.js",
9
9
  "@vespermcp/mcp-server": "./build/index.js",
10
- "vesper-wizard": "src/scripts/wizard.js"
10
+ "vesper-wizard": "scripts/wizard.js"
11
11
  },
12
12
  "files": [
13
13
  "build/**/*",
@@ -0,0 +1,307 @@
1
+ #!/usr/bin/env node
2
+
3
+ // ─────────────────────────────────────────────────────────────
4
+ // vesper-wizard — Zero-friction local setup for Vesper MCP
5
+ // Run: npx vesper-wizard@latest
6
+ // ─────────────────────────────────────────────────────────────
7
+
8
+ const fs = require('fs');
9
+ const path = require('path');
10
+ const os = require('os');
11
+ const crypto = require('crypto');
12
+ const { execSync, spawnSync } = require('child_process');
13
+
14
+ // ── Paths ────────────────────────────────────────────────────
15
+ const HOME = os.homedir();
16
+ const VESPER_DIR = path.join(HOME, '.vesper');
17
+ const CONFIG_TOML = path.join(VESPER_DIR, 'config.toml');
18
+ const DATA_DIR = path.join(VESPER_DIR, 'data');
19
+ const IS_WIN = process.platform === 'win32';
20
+ const APPDATA = process.env.APPDATA || path.join(HOME, 'AppData', 'Roaming');
21
+
22
+ // ── Helpers ──────────────────────────────────────────────────
23
+ function ensureDir(dir) {
24
+ if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
25
+ }
26
+
27
+ function generateLocalKey() {
28
+ const random = crypto.randomBytes(24).toString('hex');
29
+ return `vesper_sk_local_${random}`;
30
+ }
31
+
32
+ function readToml(filePath) {
33
+ if (!fs.existsSync(filePath)) return {};
34
+ const content = fs.readFileSync(filePath, 'utf8');
35
+ const obj = {};
36
+ for (const line of content.split('\n')) {
37
+ const m = line.match(/^\s*(\w+)\s*=\s*"(.*)"\s*$/);
38
+ if (m) obj[m[1]] = m[2];
39
+ }
40
+ return obj;
41
+ }
42
+
43
+ function writeToml(filePath, data) {
44
+ ensureDir(path.dirname(filePath));
45
+ const lines = Object.entries(data).map(([k, v]) => `${k} = "${v}"`);
46
+ fs.writeFileSync(filePath, lines.join('\n') + '\n', 'utf8');
47
+ }
48
+
49
+ function dim(text) { return `\x1b[2m${text}\x1b[0m`; }
50
+ function bold(text) { return `\x1b[1m${text}\x1b[0m`; }
51
+ function green(text) { return `\x1b[32m${text}\x1b[0m`; }
52
+ function cyan(text) { return `\x1b[36m${text}\x1b[0m`; }
53
+ function yellow(text) { return `\x1b[33m${text}\x1b[0m`; }
54
+ function red(text) { return `\x1b[31m${text}\x1b[0m`; }
55
+ function magenta(text) { return `\x1b[35m${text}\x1b[0m`; }
56
+
57
+ function printBanner() {
58
+ console.log(`
59
+ ${dim('─────────────────────────────────────────────────')}
60
+
61
+ ${bold('ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ')}
62
+ ${bold('ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ')}
63
+ ${bold('ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ')}
64
+ ${bold(' ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ')}
65
+ ${bold(' ā–ˆā–ˆā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆā–ˆ ā–ˆā–ˆ ā–ˆā–ˆ')}
66
+
67
+ ${cyan('dataset intelligence layer')}
68
+ ${dim('local-first • zero-config • agent-native')}
69
+
70
+ ${dim('─────────────────────────────────────────────────')}
71
+ `);
72
+ }
73
+
74
+ // ── MCP Auto-Config ──────────────────────────────────────────
75
+ function getAllAgentConfigs() {
76
+ const isMac = process.platform === 'darwin';
77
+ return [
78
+ {
79
+ name: 'Claude Code',
80
+ path: path.join(HOME, '.claude.json'),
81
+ format: 'mcpServers',
82
+ },
83
+ {
84
+ name: 'Claude Desktop',
85
+ path: IS_WIN
86
+ ? path.join(APPDATA, 'Claude', 'claude_desktop_config.json')
87
+ : isMac
88
+ ? path.join(HOME, 'Library', 'Application Support', 'Claude', 'claude_desktop_config.json')
89
+ : path.join(HOME, '.config', 'claude', 'claude_desktop_config.json'),
90
+ format: 'mcpServers',
91
+ },
92
+ {
93
+ name: 'Cursor',
94
+ path: path.join(HOME, '.cursor', 'mcp.json'),
95
+ format: 'mcpServers',
96
+ },
97
+ {
98
+ name: 'VS Code',
99
+ path: IS_WIN
100
+ ? path.join(APPDATA, 'Code', 'User', 'mcp.json')
101
+ : isMac
102
+ ? path.join(HOME, 'Library', 'Application Support', 'Code', 'User', 'mcp.json')
103
+ : path.join(HOME, '.config', 'Code', 'User', 'mcp.json'),
104
+ format: 'servers',
105
+ },
106
+ {
107
+ name: 'Codex',
108
+ path: path.join(HOME, '.codex', 'config.toml'),
109
+ format: 'toml',
110
+ },
111
+ {
112
+ name: 'Gemini CLI',
113
+ path: path.join(HOME, '.gemini', 'settings.json'),
114
+ format: 'mcpServers',
115
+ },
116
+ ];
117
+ }
118
+
119
+ function installMcpToAgent(agent) {
120
+ const npxCmd = IS_WIN ? 'npx.cmd' : 'npx';
121
+ const serverEntry = { command: npxCmd, args: ['-y', '@vespermcp/mcp-server@latest'] };
122
+
123
+ try {
124
+ if (agent.format === 'toml') {
125
+ let content = fs.existsSync(agent.path) ? fs.readFileSync(agent.path, 'utf8') : '';
126
+ if (content.includes('[mcp_servers.vesper]')) return true;
127
+ ensureDir(path.dirname(agent.path));
128
+ content += `\n[mcp_servers.vesper]\ncommand = "${serverEntry.command}"\nargs = [${serverEntry.args.map(a => `"${a}"`).join(', ')}]\n`;
129
+ fs.writeFileSync(agent.path, content, 'utf8');
130
+ return true;
131
+ }
132
+
133
+ let config = {};
134
+ if (fs.existsSync(agent.path)) {
135
+ try { config = JSON.parse(fs.readFileSync(agent.path, 'utf8').trim() || '{}'); } catch { config = {}; }
136
+ } else {
137
+ ensureDir(path.dirname(agent.path));
138
+ }
139
+
140
+ const key = agent.format === 'servers' ? 'servers' : 'mcpServers';
141
+ if (!config[key]) config[key] = {};
142
+
143
+ const entry = agent.format === 'servers'
144
+ ? { type: 'stdio', ...serverEntry }
145
+ : serverEntry;
146
+
147
+ config[key].vesper = entry;
148
+ fs.writeFileSync(agent.path, JSON.stringify(config, null, 2), 'utf8');
149
+ return true;
150
+ } catch {
151
+ return false;
152
+ }
153
+ }
154
+
155
// ── Server Health Check ──────────────────────────────────────
/**
 * Quick stdio health probe: spawn the published server binary and see if it
 * responds within 10 seconds.
 *
 * NOTE: spawnSync does NOT throw on launch failure or timeout — it reports
 * those through `result.error` — so that field is checked explicitly instead
 * of relying on the surrounding try/catch (which only guards the unexpected).
 *
 * @returns {Promise<boolean>} true when the server exits cleanly or
 *   identifies itself as Vesper on stderr; false otherwise.
 */
async function checkServerHealth() {
  try {
    const result = spawnSync(IS_WIN ? 'npx.cmd' : 'npx', ['-y', '@vespermcp/mcp-server@latest', '--version'], {
      timeout: 10000,
      encoding: 'utf8',
      stdio: ['pipe', 'pipe', 'pipe'],
    });
    // ENOENT (npx missing), ETIMEDOUT, EACCES, etc. land here, not in catch.
    if (result.error) return false;
    // Normalize to a real boolean: the original `a || (b && c)` form could
    // leak '' (empty stderr) to the caller as a falsy non-boolean.
    return result.status === 0 || Boolean(result.stderr && result.stderr.includes('Vesper'));
  } catch {
    // Defensive: anything unexpected counts as "unhealthy".
    return false;
  }
}
169
+
170
// ── Main Wizard ──────────────────────────────────────────────
/**
 * Interactive 6-step setup wizard: creates the local Vesper directories,
 * generates (or reuses) a local API key, initializes the credentials vault,
 * installs the MCP server package, auto-configures every detected coding
 * agent, and prints a verification summary. Errors propagate to the
 * top-level `.catch` handler, which exits with code 1.
 */
async function main() {
  printBanner();

  console.log(` ${green('→')} Setting up Vesper on ${bold(os.hostname())}\n`);

  // ─── Step 1: Create directories ────────────────────────────
  process.stdout.write(` ${dim('[')}${cyan('1/6')}${dim(']')} Creating local directories...`);
  ensureDir(VESPER_DIR);
  ensureDir(DATA_DIR);
  ensureDir(path.join(DATA_DIR, 'raw'));
  ensureDir(path.join(DATA_DIR, 'processed'));
  ensureDir(path.join(VESPER_DIR, 'datasets'));
  console.log(` ${green('āœ“')}`);

  // ─── Step 2: Generate local API key ────────────────────────
  // Idempotent: an api_key already present in config.toml is reused, so
  // re-running the wizard never rotates the user's key.
  process.stdout.write(` ${dim('[')}${cyan('2/6')}${dim(']')} Generating local API key...`);
  const existing = readToml(CONFIG_TOML);
  const localKey = existing.api_key || generateLocalKey();
  const configData = { ...existing, api_key: localKey };
  writeToml(CONFIG_TOML, configData);
  console.log(` ${green('āœ“')}`);
  console.log(` ${dim('Key:')} ${dim(localKey.slice(0, 20) + '...')} ${dim('→')} ${dim(CONFIG_TOML)}`);

  // ─── Step 3: Local vault initialization ────────────────────
  // auth_mode defaults to 'local_unified' but an existing value is preserved.
  process.stdout.write(`\n ${dim('[')}${cyan('3/6')}${dim(']')} Initializing local credentials vault...`);
  configData.auth_mode = configData.auth_mode || 'local_unified';
  writeToml(CONFIG_TOML, configData);
  console.log(` ${green('āœ“')}`);
  console.log(` ${dim('Mode:')} ${dim('single local Vesper key (no external keys required)')}`);

  // ─── Step 4: Install @vespermcp/mcp-server ─────────────────
  console.log(`\n ${dim('[')}${cyan('4/6')}${dim(']')} Installing Vesper MCP server...`);
  try {
    const npmCmd = IS_WIN ? 'npx.cmd' : 'npx';
    // NOTE(review): spawnSync reports launch failures/timeouts via
    // `result.error` rather than throwing, so this catch is unlikely to
    // fire and a failed install may still print the success line — confirm.
    spawnSync(npmCmd, ['-y', '@vespermcp/mcp-server@latest', '--setup', '--silent'], {
      stdio: 'inherit',
      timeout: 120000,
    });
    console.log(` ${green('āœ“')} @vespermcp/mcp-server installed`);
  } catch {
    console.log(` ${yellow('⚠')} Could not auto-install — run manually: npx @vespermcp/mcp-server --setup`);
  }

  // ─── Step 5: Auto-configure all detected IDEs ──────────────
  // An agent is considered "detected" when its config file OR its parent
  // directory already exists; others are left untouched.
  process.stdout.write(`\n ${dim('[')}${cyan('5/6')}${dim(']')} Configuring coding agents...`);
  const agents = getAllAgentConfigs();
  const configuredAgents = [];
  // NOTE(review): skippedAgents is collected but never reported below —
  // either surface it in the summary or drop it.
  const skippedAgents = [];

  for (const agent of agents) {
    const dirExists = fs.existsSync(path.dirname(agent.path));
    const fileExists = fs.existsSync(agent.path);
    if (fileExists || dirExists) {
      const ok = installMcpToAgent(agent);
      if (ok) configuredAgents.push(agent.name);
      else skippedAgents.push(agent.name);
    }
  }
  console.log(` ${green('āœ“')}`);

  if (configuredAgents.length > 0) {
    console.log(`\n ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”`);
    console.log(` │ ${bold('MCP Auto-Configured')} │`);
    console.log(` ā”œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¤`);
    for (const name of configuredAgents) {
      console.log(` │ ${green('āœ“')} ${name.padEnd(42)}│`);
    }
    console.log(` ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜`);
  }

  // ─── Step 6: Verify ────────────────────────────────────────
  // These are presence checks only — missing index/vector files are
  // expected on a fresh install and are built lazily on first search.
  console.log(`\n ${dim('[')}${cyan('6/6')}${dim(']')} Verifying installation...`);

  const dbExists = fs.existsSync(path.join(DATA_DIR, 'metadata.db'));
  const vecExists = fs.existsSync(path.join(DATA_DIR, 'vectors.json')) || fs.existsSync(path.join(DATA_DIR, 'vectors.bin'));
  const keyStored = fs.existsSync(CONFIG_TOML);

  console.log(` ${keyStored ? green('āœ“') : red('āœ—')} Local API key ${dim(CONFIG_TOML)}`);
  console.log(` ${dbExists ? green('āœ“') : yellow('⚠')} Dataset index ${dim(dbExists ? 'ready' : 'will build on first search')}`);
  console.log(` ${vecExists ? green('āœ“') : yellow('⚠')} Vector store ${dim(vecExists ? 'ready' : 'will build on first search')}`);
  console.log(` ${configuredAgents.length > 0 ? green('āœ“') : yellow('⚠')} MCP agents ${dim(configuredAgents.length + ' configured')}`);

  // ─── Final Summary ─────────────────────────────────────────
  // Single multi-line template keeps the banner layout in one place.
  console.log(`
 ${dim('═════════════════════════════════════════════════')}

 ${green(bold('āœ“ Vesper is ready!'))}

 ${bold('Your local API key:')}
 ${cyan(localKey)}

 ${bold('Config file:')}
 ${dim(CONFIG_TOML)}

 ${bold('What just happened:')}
 ${dim('1.')} Generated a local API key (never leaves your machine)
 ${dim('2.')} Initialized local credentials vault
 ${dim('3.')} Auto-configured MCP for ${configuredAgents.length > 0 ? configuredAgents.join(', ') : 'detected agents'}
 ${dim('4.')} Vesper server ready on stdio transport

 ${dim('─────────────────────────────────────────────────')}

 ${bold('Quick start — try in your AI assistant:')}

 ${cyan('Search datasets')}
 ${dim('>')} vesper_search(query="sentiment analysis")

 ${cyan('Download & prepare')}
 ${dim('>')} prepare_dataset(query="image classification cats dogs")

 ${cyan('Quality analysis')}
 ${dim('>')} analyze_quality(dataset_id="imdb")

 ${cyan('Export to your project')}
 ${dim('>')} export_dataset(dataset_id="imdb", format="parquet")

 ${dim('─────────────────────────────────────────────────')}

 ${bold('Unified API — one interface, every source:')}
 HuggingFace Ā· Kaggle Ā· OpenML Ā· data.world

 ${dim('Agents call localhost Vesper APIs with one local key.')}
 ${dim('Vesper adapters handle provider routing internally.')}

 ${dim('─────────────────────────────────────────────────')}

 ${yellow('→')} Restart your IDE to activate MCP
 ${dim('Docs:')} https://github.com/vesper/mcp-server

 ${dim('═════════════════════════════════════════════════')}
 `);
}
303
+
304
// Entry point: run the wizard; any rejection is reported and the process
// exits non-zero so scripted installs can detect failure.
main().catch((err) => {
  console.error(`\n${red('Error:')} ${err.message || err}`);
  process.exit(1);
});
@@ -31,6 +31,19 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
31
31
  ext = os.path.splitext(file_path)[1].lower()
32
32
  if ext == ".csv":
33
33
  df = pl.read_csv(file_path, ignore_errors=True)
34
+ elif ext == ".tsv":
35
+ df = pl.read_csv(file_path, separator="\t", ignore_errors=True)
36
+ elif ext == ".txt":
37
+ # Heuristic delimiter detection for plain text tabular files.
38
+ sep = ","
39
+ try:
40
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
41
+ first_line = fh.readline()
42
+ if "\t" in first_line:
43
+ sep = "\t"
44
+ except Exception:
45
+ sep = ","
46
+ df = pl.read_csv(file_path, separator=sep, ignore_errors=True)
34
47
  elif ext in (".parquet", ".pq"):
35
48
  df = pl.read_parquet(file_path)
36
49
  elif ext in (".feather", ".ftr", ".arrow", ".ipc"):
@@ -40,6 +53,9 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
40
53
  else:
41
54
  raise ValueError(f"Unsupported input format: {ext}")
42
55
 
56
+ if len(df) == 0:
57
+ raise ValueError("empty CSV")
58
+
43
59
  # Column selection (before sampling for speed)
44
60
  if columns:
45
61
  valid = [c for c in columns if c in df.columns]
@@ -102,6 +102,18 @@ def main():
102
102
  file_path_lower = file_path.lower()
103
103
  if file_path_lower.endswith(".csv"):
104
104
  df = pl.read_csv(file_path, ignore_errors=True, n_rows=10000)
105
+ elif file_path_lower.endswith(".tsv"):
106
+ df = pl.read_csv(file_path, separator="\t", ignore_errors=True, n_rows=10000)
107
+ elif file_path_lower.endswith(".txt"):
108
+ sep = ","
109
+ try:
110
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
111
+ first_line = fh.readline()
112
+ if "\t" in first_line:
113
+ sep = "\t"
114
+ except Exception:
115
+ sep = ","
116
+ df = pl.read_csv(file_path, separator=sep, ignore_errors=True, n_rows=10000)
105
117
  elif file_path_lower.endswith(".parquet"):
106
118
  try:
107
119
  # Try scanning first (faster for large files)
@@ -133,10 +145,18 @@ def main():
133
145
  column_count = len(df.columns)
134
146
 
135
147
  # Duplicate detection (exact)
148
+ # NOTE: Some Polars versions can panic on is_duplicated() for nested/null rows.
149
+ # Use a Python fallback that is slower but robust for the 10k sampled rows.
150
+ duplicate_count = 0
136
151
  try:
137
- duplicate_count = df.is_duplicated().sum()
152
+ seen = set()
153
+ for row in df.to_dicts():
154
+ row_key = json.dumps(row, sort_keys=True, default=str)
155
+ if row_key in seen:
156
+ duplicate_count += 1
157
+ else:
158
+ seen.add(row_key)
138
159
  except Exception:
139
- # Duplicate check might fail on complex nested types (List, Struct)
140
160
  duplicate_count = 0
141
161
 
142
162
  columns_stats = []
@@ -165,12 +185,16 @@ def main():
165
185
  if duplicate_count == 0 and len(text_cols) > 0:
166
186
  # Pick longest text column as likely "content"
167
187
  # In real impl, we'd use heuristics. For now, first text col.
168
- target_col = text_cols[0]
169
- text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
170
- if text_dupes > 0:
171
- report["text_duplicates"] = int(text_dupes)
172
- if text_dupes > (row_count * 0.2):
173
- report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")
188
+ target_col = text_cols[0]
189
+ try:
190
+ text_dupes = df.select(pl.col(target_col)).is_duplicated().sum()
191
+ if text_dupes > 0:
192
+ report["text_duplicates"] = int(text_dupes)
193
+ if text_dupes > (row_count * 0.2):
194
+ report["warnings"].append(f"High text duplication in '{target_col}' ({text_dupes} rows)")
195
+ except Exception:
196
+ # Skip text duplicate warning if backend cannot compute duplicates for this dtype
197
+ pass
174
198
 
175
199
  # Integrity Check 2: Contamination / Leakage (Basic)
176
200
  # (Skipping correlation for now)
@@ -1,77 +0,0 @@
1
- #!/usr/bin/env node
2
-
3
- // Vesper Wizard CLI: Interactive setup for fast configuration
4
- const inquirer = require('inquirer');
5
- const fs = require('fs');
6
- const path = require('path');
7
-
8
/**
 * Legacy interactive wizard (removed in this release): prompts for project
 * basics, data directory, export format, and optional API tokens via
 * inquirer, then writes the answers to vesper-mcp-config.json in the
 * current working directory.
 */
async function main() {
  console.log('\nšŸ§™ Welcome to the Vesper Wizard!\n');

  // Step 1: Project basics — defaults to the current directory's name.
  const { projectName } = await inquirer.prompt([
    {
      type: 'input',
      name: 'projectName',
      message: 'Project name:',
      default: path.basename(process.cwd()),
    },
  ]);

  // Step 2: Data directory
  const { dataDir } = await inquirer.prompt([
    {
      type: 'input',
      name: 'dataDir',
      message: 'Path to your data directory:',
      default: './datasets',
    },
  ]);

  // Step 3: Default export format
  const { exportFormat } = await inquirer.prompt([
    {
      type: 'list',
      name: 'exportFormat',
      message: 'Default export format:',
      choices: ['parquet', 'csv', 'feather'],
      default: 'parquet',
    },
  ]);

  // Step 4: Add tokens/credentials
  const { addTokens } = await inquirer.prompt([
    {
      type: 'confirm',
      name: 'addTokens',
      message: 'Would you like to add API tokens or credentials now?',
      default: true,
    },
  ]);
  let tokens = {};
  if (addTokens) {
    const { kaggleToken } = await inquirer.prompt([
      {
        type: 'input',
        name: 'kaggleToken',
        message: 'Kaggle API token (leave blank to skip):',
      },
    ]);
    if (kaggleToken) tokens.kaggle = kaggleToken;
    // Add more tokens as needed
  }

  // Step 5: Write config file
  // NOTE(review): the token is persisted in plaintext in the project
  // directory — it may end up committed to version control.
  const config = {
    project: projectName,
    dataDir,
    exportFormat,
    tokens,
  };
  const configPath = path.join(process.cwd(), 'vesper-mcp-config.json');
  fs.writeFileSync(configPath, JSON.stringify(config, null, 2));
  console.log(`\nāœ… Configuration saved to ${configPath}`);
  console.log('\nšŸŽ‰ Vesper is ready to use!\n');
}

// NOTE(review): no .catch() here — a prompt/write failure surfaces as an
// unhandled promise rejection instead of a clean error message.
main();