@vespermcp/mcp-server 1.0.5 → 1.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/README.md +6 -4
  2. package/build/cleaning/cleaner.js +27 -2
  3. package/build/cleaning/executor.js +7 -6
  4. package/build/cleaning/planner.js +16 -4
  5. package/build/config/config-manager.js +215 -0
  6. package/build/export/exporter.js +26 -2
  7. package/build/index.js +273 -92
  8. package/build/ingestion/ingestor.js +5 -22
  9. package/build/install/install-service.js +1 -1
  10. package/build/jobs/manager.js +17 -10
  11. package/build/metadata/monitoring-service.js +2 -2
  12. package/build/metadata/scraper.js +8 -8
  13. package/build/metadata/store.js +17 -2
  14. package/build/monitoring/observability.js +2 -2
  15. package/build/preparation/target-detector.js +75 -0
  16. package/build/python/cleaner.py +226 -0
  17. package/build/python/export_engine.py +131 -0
  18. package/build/python/framework_adapters.py +100 -0
  19. package/build/python/github_adapter.py +106 -0
  20. package/build/python/image_engine.py +86 -0
  21. package/build/python/media_engine.py +133 -0
  22. package/build/python/nasa_adapter.py +82 -0
  23. package/build/python/quality_engine.py +243 -0
  24. package/build/python/splitter_engine.py +283 -0
  25. package/build/python/target_engine.py +154 -0
  26. package/build/python/test_framework_adapters.py +61 -0
  27. package/build/python/uci_adapter.py +94 -0
  28. package/build/python/worldbank_adapter.py +99 -0
  29. package/build/quality/analyzer.js +40 -4
  30. package/build/quality/image-analyzer.js +28 -2
  31. package/build/quality/media-analyzer.js +28 -2
  32. package/build/scripts/cleanup-kaggle.js +41 -0
  33. package/build/scripts/repro-bug.js +37 -0
  34. package/build/scripts/repro-export-bug.js +56 -0
  35. package/build/scripts/test-mcp-v5.js +12 -11
  36. package/build/scripts/test-production-sync.js +36 -0
  37. package/build/scripts/test-target-detector.js +29 -0
  38. package/build/scripts/test-write.js +14 -0
  39. package/build/scripts/verify-integration.js +57 -0
  40. package/build/scripts/verify-priority.js +33 -0
  41. package/build/search/engine.js +13 -2
  42. package/build/search/jit-orchestrator.js +6 -40
  43. package/build/search/vector-store.js +18 -0
  44. package/build/splitting/splitter.js +27 -2
  45. package/build/tools/formatter.js +15 -6
  46. package/build/utils/downloader.js +2 -2
  47. package/build/utils/selector.js +69 -0
  48. package/package.json +8 -4
  49. package/src/python/cleaner.py +33 -3
  50. package/src/python/export_engine.py +19 -0
  51. package/src/python/target_engine.py +154 -0
package/build/index.js CHANGED
@@ -23,30 +23,98 @@ import { CacheService, MockRedisProvider } from "./cache/service.js";
23
23
  import { ImageAnalyzer } from "./quality/image-analyzer.js";
24
24
  import { MediaAnalyzer } from "./quality/media-analyzer.js";
25
25
  import { QualityOrchestrator } from "./quality/quality-orchestrator.js";
26
+ import { ConfigManager } from "./config/config-manager.js";
27
+ import { Selector } from "./utils/selector.js";
28
+ import os from "os";
26
29
  // Determine absolute paths relative to the compiled script
27
30
  const __filename = fileURLToPath(import.meta.url);
28
31
  const __dirname = path.dirname(__filename);
29
- // We are in /build/index.js, so project root is one level up
30
- const projectRoot = path.join(__dirname, "..");
31
- const dbPath = path.join(projectRoot, "data", "metadata.db");
32
- const vectorPath = path.join(projectRoot, "data", "vectors.json");
32
+ // appRoot: Where the source code/scripts are (inside node_modules or source)
33
+ const appRoot = path.join(__dirname, "..");
34
+ // dataRoot: Where database and user data live (in user home)
35
+ // Use os.homedir() as it's more reliable than env vars
36
+ const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || appRoot;
37
+ const dataRoot = path.join(homeDir, ".vesper");
38
+ // Ensure data directory exists
39
+ if (!fs.existsSync(dataRoot))
40
+ fs.mkdirSync(dataRoot, { recursive: true });
41
+ const dbPath = path.join(dataRoot, "data", "metadata.db");
42
+ const vectorPath = path.join(dataRoot, "data", "vectors.json");
43
+ const errorLogPath = path.join(dataRoot, "vesper_errors.log");
44
+ console.error(`[Vesper] Data directory: ${dataRoot}`);
45
+ console.error(`[Vesper] Database path: ${dbPath}`);
46
+ function logError(err, context) {
47
+ const timestamp = new Date().toISOString();
48
+ const stack = err.stack || String(err);
49
+ const msg = `[${timestamp}] ERROR in ${context}:\n${stack}\n${"-".repeat(50)}\n`;
50
+ fs.appendFileSync(errorLogPath, msg);
51
+ console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
52
+ }
53
+ /**
54
+ * Sync Python scripts from the application package to the stable data directory (~/.vesper/python)
55
+ */
56
+ function syncPythonScripts(appRoot, dataRoot) {
57
+ const pythonDest = path.join(dataRoot, "python");
58
+ if (!fs.existsSync(pythonDest))
59
+ fs.mkdirSync(pythonDest, { recursive: true });
60
+ // Sources to check for Python scripts
61
+ const sources = [
62
+ path.join(appRoot, "src", "python"),
63
+ path.join(appRoot, "build", "python"),
64
+ path.join(appRoot, "python")
65
+ ];
66
+ let syncedCount = 0;
67
+ for (const src of sources) {
68
+ if (fs.existsSync(src)) {
69
+ const files = fs.readdirSync(src);
70
+ for (const file of files) {
71
+ if (file.endsWith(".py")) {
72
+ const srcPath = path.join(src, file);
73
+ const destPath = path.join(pythonDest, file);
74
+ // Only copy if file doesn't exist or is different size (basic sync)
75
+ const srcStat = fs.statSync(srcPath);
76
+ let shouldCopy = true;
77
+ if (fs.existsSync(destPath)) {
78
+ const destStat = fs.statSync(destPath);
79
+ if (srcStat.size === destStat.size)
80
+ shouldCopy = false;
81
+ }
82
+ if (shouldCopy) {
83
+ fs.copyFileSync(srcPath, destPath);
84
+ syncedCount++;
85
+ }
86
+ }
87
+ }
88
+ }
89
+ }
90
+ if (syncedCount > 0) {
91
+ console.error(`[Vesper] Synced ${syncedCount} Python scripts to ${pythonDest}`);
92
+ }
93
+ }
94
+ // Sync scripts immediately
95
+ syncPythonScripts(appRoot, dataRoot);
33
96
  const metadataStore = new MetadataStore(dbPath);
34
97
  const vectorStore = new VectorStore(vectorPath);
35
98
  const embedder = Embedder.getInstance();
36
99
  const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
37
100
  const jobManager = JobManager.getInstance(metadataStore);
38
- const dataIngestor = new DataIngestor(projectRoot, metadataStore);
39
- const installService = new InstallService(projectRoot, metadataStore);
101
+ // Use dataRoot for storage services (persistence)
102
+ const dataIngestor = new DataIngestor(dataRoot, metadataStore);
103
+ const installService = new InstallService(dataRoot, metadataStore);
40
104
  const cacheService = new CacheService(new MockRedisProvider());
41
- const qualityAnalyzer = new QualityAnalyzer(cacheService, projectRoot);
42
- const cleaningPlanner = new CleaningPlanner(cacheService);
43
- const dataCleaner = new DataCleaner(projectRoot);
44
- const pipelineExecutor = new PipelineExecutor(projectRoot);
45
- const dataSplitter = new DataSplitter(projectRoot);
46
- const dataExporter = new DataExporter(projectRoot);
47
- const imageAnalyzer = new ImageAnalyzer(projectRoot);
48
- const mediaAnalyzer = new MediaAnalyzer(projectRoot);
49
- const qualityOrchestrator = new QualityOrchestrator(projectRoot);
105
+ const dataCleaner = new DataCleaner(__dirname);
106
+ const pipelineExecutor = new PipelineExecutor(dataRoot, __dirname);
107
+ const dataSplitter = new DataSplitter(__dirname);
108
+ const dataExporter = new DataExporter(__dirname);
109
+ // CRITICAL FIX: Pass __dirname (build directory) to analyzers
110
+ // Python scripts are in build/python/, so analyzers should look relative to build/
111
+ // NOT relative to project root (appRoot)
112
+ process.env.PYTHONIOENCODING = "utf-8";
113
+ const qualityAnalyzer = new QualityAnalyzer(cacheService, __dirname);
114
+ const cleaningPlanner = new CleaningPlanner(cacheService, __dirname); // Pass __dirname for TargetDetector
115
+ const imageAnalyzer = new ImageAnalyzer(__dirname);
116
+ const mediaAnalyzer = new MediaAnalyzer(__dirname);
117
+ const qualityOrchestrator = new QualityOrchestrator(__dirname);
50
118
  // Subscribe to job updates for real-time streaming to the UI
51
119
  jobManager.on("jobUpdated", (job) => {
52
120
  const level = job.status === "failed" ? "error" : "info";
@@ -58,12 +126,91 @@ jobManager.on("jobUpdated", (job) => {
58
126
  });
59
127
  });
60
128
  // IMPORTANT: Execute jobs when the manager emits them
61
- // This connects the queue logic to the execution context
62
129
  jobManager.on("processJob", async (job, execute) => {
63
- // The JobManager controls concurrency, so if we receive this event,
64
- // we should execute the job immediately.
65
- await execute();
130
+ console.error(`[Vesper] Listener RECEIVED job: ${job?.id}, execute type: ${typeof execute}`);
131
+ if (typeof execute !== 'function') {
132
+ console.error(`[CRITICAL] execute is NOT a function! It is: ${typeof execute}`);
133
+ logError(new Error(`execute is ${typeof execute}`), "listener:execute_check");
134
+ return;
135
+ }
136
+ const prepareDatasetTask = async () => {
137
+ console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
138
+ const metadata = job.metadata ? JSON.parse(job.metadata) : {};
139
+ switch (job.type) {
140
+ case "prepare": return await handlePrepareJob(job.id, metadata.query);
141
+ case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
142
+ default: throw new Error(`Unhandled job type: ${job.type}`);
143
+ }
144
+ };
145
+ try {
146
+ console.error(`[Vesper] Calling execute(prepareDatasetTask) for ${job.id}...`);
147
+ await execute(prepareDatasetTask);
148
+ console.error(`[Vesper] execute(prepareDatasetTask) COMPLETED for ${job.id}`);
149
+ }
150
+ catch (e) {
151
+ logError(e, `processJob:${job.type}:${job.id}`);
152
+ console.error(`[Vesper] Error in execute wrapper for ${job.id}: ${e.message}`);
153
+ }
66
154
  });
155
+ /**
156
+ * Logic for preparing a dataset (Search + Ingest + Process)
157
+ */
158
+ async function handlePrepareJob(jobId, query) {
159
+ const update = (updates) => jobManager.updateJob(jobId, updates);
160
+ update({ progress: 10, status_text: "Searching for best dataset matching query..." });
161
+ const results = await searchEngine.search(query, { limit: 1 });
162
+ if (results.length === 0) {
163
+ throw new Error("No datasets found matching the query. Try refining your search terms.");
164
+ }
165
+ const topDataset = results[0];
166
+ update({
167
+ progress: 20,
168
+ status_text: `Matched: ${topDataset.name} (${topDataset.source})`
169
+ });
170
+ const source = topDataset.source;
171
+ // Pre-check credentials for Kaggle
172
+ if (source === "kaggle") {
173
+ if (!process.env.KAGGLE_USERNAME || !process.env.KAGGLE_KEY ||
174
+ process.env.KAGGLE_USERNAME === "YOUR_KAGGLE_USERNAME") {
175
+ throw new Error("Kaggle credentials not set. Use 'kaggle login' or set KAGGLE_USERNAME/KAGGLE_KEY.");
176
+ }
177
+ }
178
+ update({ progress: 30, status_text: `Starting download from ${source}...` });
179
+ // ensureData handles download and returns path to the raw file
180
+ const rawFilePath = await dataIngestor.ensureData(topDataset.id, source, (msg, prog) => {
181
+ update({ status_text: msg, progress: 30 + (prog ? Math.floor(prog * 0.4) : 0) });
182
+ });
183
+ update({ progress: 70, status_text: "Analyzing dataset quality..." });
184
+ const report = await qualityAnalyzer.analyze(rawFilePath);
185
+ // Update local metadata with quality info
186
+ metadataStore.saveDataset({
187
+ ...topDataset,
188
+ quality_score: report.overall_score
189
+ });
190
+ update({ progress: 85, status_text: "Installing dataset into project..." });
191
+ const installPath = await installService.install(topDataset.id, rawFilePath);
192
+ update({ progress: 100, status_text: "Preparation complete!" });
193
+ return installPath;
194
+ }
195
+ /**
196
+ * Logic for cleaning a dataset
197
+ */
198
+ async function handleCleanJob(jobId, datasetId, ops) {
199
+ const update = (updates) => jobManager.updateJob(jobId, updates);
200
+ let filePath = path.join(dataRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
201
+ if (datasetId === "demo" || !fs.existsSync(filePath)) {
202
+ const demoPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
203
+ if (fs.existsSync(demoPath))
204
+ filePath = demoPath;
205
+ else
206
+ throw new Error(`Data file not found for ${datasetId}`);
207
+ }
208
+ update({ status_text: "Cleaning dataset..." });
209
+ const result = await dataCleaner.clean(filePath, ops);
210
+ if (!result.success)
211
+ throw new Error(result.error);
212
+ return result.output_path;
213
+ }
67
214
  // Create the server
68
215
  const server = new Server({
69
216
  name: "vesper",
@@ -308,10 +455,10 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
308
455
  }
309
456
  case "analyze_quality": {
310
457
  const datasetId = String(request.params.arguments?.dataset_id);
311
- let filePath = path.join(projectRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
458
+ let filePath = path.join(dataRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
312
459
  // Demo Fallback for easy testing
313
460
  if (datasetId === "demo" || !fs.existsSync(filePath)) {
314
- const demoPath = path.join(projectRoot, "e2e_demo_output", "raw_data.csv");
461
+ const demoPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
315
462
  if (fs.existsSync(demoPath)) {
316
463
  filePath = demoPath;
317
464
  }
@@ -329,9 +476,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
329
476
  }
330
477
  case "preview_cleaning": {
331
478
  const datasetId = String(request.params.arguments?.dataset_id);
332
- let filePath = path.join(projectRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
479
+ let filePath = path.join(dataRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
333
480
  if (datasetId === "demo" || !fs.existsSync(filePath)) {
334
- const demoPath = path.join(projectRoot, "e2e_demo_output", "raw_data.csv");
481
+ const demoPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
335
482
  if (fs.existsSync(demoPath)) {
336
483
  filePath = demoPath;
337
484
  }
@@ -340,8 +487,43 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
340
487
  }
341
488
  }
342
489
  const report = await qualityAnalyzer.analyze(filePath);
343
- const plan = await cleaningPlanner.generatePlan(datasetId, report);
490
+ // Phase 1: Target Detection
491
+ // We use the same TargetDetector instance inside CleaningPlanner now?
492
+ // Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
493
+ // OR let the planner handle it if we update its signature to accept filePath.
494
+ // Let's check `CleaningPlanner.generatePlan` signature again.
495
+ // We updated it to accept `targetInfo`.
496
+ // So we need to run detection HERE and pass it.
497
+ // But `TargetDetector` is not exposed in `index.ts` scope yet.
498
+ // Let's create a global instance or use the one inside planner if exposed (it's private).
499
+ // Better approach: Instantiate TargetDetector here in index.ts for the tool content.
500
+ // Quick fix: Instantiate local detector or make global.
501
+ // I'll make a global `targetDetector` constant in index.ts
502
+ // But wait, I updated `CleaningPlanner` to instantiate its own detector.
503
+ // Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
504
+ // RETRY STRATEGY:
505
+ // 1. Instantiate `targetDetector` in `index.ts`.
506
+ // 2. Run `detectTarget(filePath)`.
507
+ // 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
508
+ // I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
509
+ // But since I'm in this tool, I can't look back.
510
+ // I will assume I can add it, or just do it inside the case for now.
511
+ // To do it properly, I should have added `targetDetector` to the global scope in previous step.
512
+ // Let's do that in a separate step if needed.
513
+ // For now, I'll instantiate it here.
514
+ const { TargetDetector } = await import("./preparation/target-detector.js");
515
+ const detector = new TargetDetector(__dirname);
516
+ const targetResult = await detector.detectTarget(filePath);
517
+ const targetInfo = targetResult.target_column ? {
518
+ target: targetResult.target_column,
519
+ confidence: targetResult.confidence
520
+ } : undefined;
521
+ const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
344
522
  let explanation = `### 📋 Cleaning Plan for ${datasetId}\n\n`;
523
+ if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
524
+ explanation += `🎯 **Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
525
+ explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
526
+ }
345
527
  explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
346
528
  if (plan.operations.length === 0) {
347
529
  explanation += "✅ No cleaning operations required.";
@@ -358,25 +540,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
358
540
  case "custom_clean": {
359
541
  const datasetId = String(request.params.arguments?.dataset_id);
360
542
  const ops = request.params.arguments?.operations;
361
- let filePath = path.join(projectRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
362
- if (datasetId === "demo" || !fs.existsSync(filePath)) {
363
- const demoPath = path.join(projectRoot, "e2e_demo_output", "raw_data.csv");
364
- if (fs.existsSync(demoPath)) {
365
- filePath = demoPath;
366
- }
367
- else {
368
- throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}`);
369
- }
370
- }
371
543
  const job = jobManager.createJob("clean", 0, { datasetId, ops });
372
- // Run in background
373
- jobManager.runJob(job.id, async (update) => {
374
- update({ status_text: "Cleaning dataset..." });
375
- const result = await dataCleaner.clean(filePath, ops);
376
- if (!result.success)
377
- throw new Error(result.error);
378
- return result.output_path;
379
- });
380
544
  return {
381
545
  content: [{ type: "text", text: `Job started successfully. ID: ${job.id}. Use check_job_status to monitor progress.` }]
382
546
  };
@@ -384,56 +548,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
384
548
  case "prepare_dataset": {
385
549
  const query = String(request.params.arguments?.query);
386
550
  const job = jobManager.createJob("prepare", 0, { query });
387
- // Orchestrated Background Task
388
- jobManager.runJob(job.id, async (update) => {
389
- try {
390
- update({ progress: 10, status_text: "Searching for best dataset matching query..." });
391
- const results = await searchEngine.search(query, { limit: 1 });
392
- if (results.length === 0) {
393
- throw new Error("No datasets found matching the query. Try refining your search terms.");
394
- }
395
- const topDataset = results[0];
396
- // Phase 6: Real Ingestion
397
- update({
398
- progress: 20,
399
- status_text: `Matched: ${topDataset.name} (${topDataset.source})`
400
- });
401
- const source = topDataset.source;
402
- // Pre-check credentials for Kaggle to fail fast with helpful message
403
- if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
404
- const errorMsg = dataIngestor.getKaggleCredentialError();
405
- throw new Error(errorMsg);
406
- }
407
- const filePath = await dataIngestor.ensureData(topDataset.id, source, (msg, prog) => {
408
- update({
409
- status_text: msg,
410
- progress: prog !== undefined ? 20 + Math.floor(prog * 0.3) : undefined // 20% -> 50%
411
- });
412
- });
413
- update({ progress: 55, status_text: "Analyzing dataset quality..." });
414
- const quality = await qualityAnalyzer.analyze(filePath);
415
- const pipelineResult = await pipelineExecutor.runPipeline(topDataset.id, filePath, "csv", (msg) => {
416
- update({ status_text: msg });
417
- });
418
- update({ progress: 90, status_text: "Installing dataset into codebase..." });
419
- const installPath = await installService.install(topDataset.id, pipelineResult.final_output_path);
420
- update({ progress: 100, status_text: "Preparation complete!" });
421
- const message = `✅ Preparation complete for ${topDataset.name}.\n` +
422
- `📦 Dataset installed to: ${installPath}\n` +
423
- `🚀 You can now use this dataset for training your models.`;
424
- return message;
425
- }
426
- catch (error) {
427
- // Re-throw with enhanced error message for credential issues
428
- const errorMessage = error.message || String(error);
429
- if (errorMessage.includes("Kaggle credentials") || errorMessage.includes("KAGGLE")) {
430
- throw new Error(errorMessage);
431
- }
432
- throw error;
433
- }
434
- });
435
551
  return {
436
- content: [{ type: "text", text: `Autonomous preparation job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
552
+ content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
437
553
  };
438
554
  }
439
555
  case "compare_datasets": {
@@ -479,7 +595,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
479
595
  // Check if we need conversion
480
596
  const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
481
597
  if (currentExt !== requestedFormat) {
482
- console.log(`[Export] Format mismatch (${currentExt} vs ${requestedFormat}). Converting...`);
598
+ console.error(`[Export] Format mismatch (${currentExt} vs ${requestedFormat}). Converting...`);
483
599
  try {
484
600
  const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, requestedFormat);
485
601
  sourcePath = pipelineResult.final_output_path;
@@ -645,9 +761,74 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
645
761
  }
646
762
  });
647
763
  async function main() {
764
+ const args = process.argv.slice(2);
765
+ const isSetup = args.includes("--setup") || args.includes("setup");
766
+ const isSilent = args.includes("--silent");
767
+ // If run in setup mode OR in a terminal without args (human call), show setup wizard
768
+ if (isSetup || (process.stdin.isTTY && args.length === 0)) {
769
+ await runSetupWizard(isSilent);
770
+ return;
771
+ }
772
+ // Otherwise proceed to server mode (for IDEs/Agents)
773
+ console.error(`[Vesper] Starting server...`);
774
+ console.error(`[Vesper] dataRoot: ${dataRoot}`);
775
+ console.error(`[Vesper] dbPath: ${dbPath}`);
648
776
  const transport = new StdioServerTransport();
649
777
  await server.connect(transport);
650
778
  console.error("Vesper MCP server running on stdio");
779
+ console.error("Tip: To configure Vesper for your IDE, run: npx @vespermcp/mcp-server --setup");
780
+ console.log("[Vesper] Main loop finished");
781
+ }
782
+ async function runSetupWizard(silent = false) {
783
+ const configManager = new ConfigManager();
784
+ if (!silent) {
785
+ console.log(`\n🚀 Welcome to Vesper MCP Setup!`);
786
+ console.log(`--------------------------------`);
787
+ console.log(`Searching for IDE configurations...`);
788
+ }
789
+ const ides = configManager.detectIDEs();
790
+ if (ides.length === 0) {
791
+ if (!silent) {
792
+ console.log("\n❌ No supported IDEs detected.");
793
+ console.log("I checked for:");
794
+ console.log(" - Cursor (Global & Project)");
795
+ console.log(" - Claude Desktop");
796
+ console.log(" - VS Code (Standard MCP, Copilot Chat, Cline, Roo Code)");
797
+ console.log("\nIf you are using VS Code or Cursor, please make sure they are installed.");
798
+ console.log("For project-specific setup, run this command inside your project folder.");
799
+ }
800
+ return;
801
+ }
802
+ if (silent) {
803
+ for (const ide of ides) {
804
+ await configManager.installTo(ide);
805
+ }
806
+ return;
807
+ }
808
+ console.log(`\nFound ${ides.length} potential application(s):`);
809
+ const selector = new Selector("Select applications to configure for Vesper:", ides.map(ide => ({
810
+ name: ide.name,
811
+ value: ide,
812
+ selected: true
813
+ })));
814
+ const selectedIDEs = await selector.run();
815
+ if (selectedIDEs.length > 0) {
816
+ console.log(`\nInstalling to ${selectedIDEs.length} application(s)...\n`);
817
+ for (const ide of selectedIDEs) {
818
+ process.stdout.write(`Installing to ${ide.name}... `);
819
+ const success = await configManager.installTo(ide);
820
+ if (success) {
821
+ process.stdout.write("✅\n");
822
+ }
823
+ else {
824
+ process.stdout.write("❌ (Check permissions or if file is in use)\n");
825
+ }
826
+ }
827
+ console.log("\n✨ Setup complete! Please RESTART your IDE(s) to apply changes.");
828
+ }
829
+ else {
830
+ console.log("\nSetup skipped. No applications selected.");
831
+ }
651
832
  }
652
833
  main().catch((error) => {
653
834
  console.error("Server error:", error);
@@ -40,7 +40,7 @@ export class DataIngestor {
40
40
  return status.local_path;
41
41
  }
42
42
  if (status && status.status === 'downloading') {
43
- console.log(`[Ingestor] Dataset ${datasetId} status is 'downloading'. Attempting to resume/concurrently monitor.`);
43
+ console.error(`[Ingestor] Dataset ${datasetId} status is 'downloading'. Attempting to resume/concurrently monitor.`);
44
44
  // In a better system we'd use a lock, but for now we let it resume
45
45
  // the RobustDownloader handles the actual file locking/range logic.
46
46
  }
@@ -67,27 +67,10 @@ export class DataIngestor {
67
67
  }
68
68
  }
69
69
  else if (source === "kaggle") {
70
- // Check credentials early and provide helpful error
71
- if (!this.kaggleDownloader.hasCredentials()) {
72
- const errorMsg = this.kaggleDownloader.getCredentialError();
73
- this.failDownload(datasetId, errorMsg);
74
- throw new Error(errorMsg);
75
- }
76
- const safeId = datasetId.replace("kaggle:", "").replace(/\//g, "_");
77
- const targetDir = path.join(this.rawDataDir, `kaggle_${safeId}`);
78
- this.store.registerDownload(datasetId, targetDir, "downloading");
79
- try {
80
- const primaryFile = await this.kaggleDownloader.download(datasetId.replace("kaggle:", ""), targetDir, (progress) => {
81
- onProgress?.("Downloading Kaggle archive...", progress);
82
- });
83
- const stats = fs.statSync(primaryFile);
84
- this.completeDownload(datasetId, primaryFile, stats.size);
85
- return primaryFile;
86
- }
87
- catch (e) {
88
- this.failDownload(datasetId, e.message);
89
- throw e;
90
- }
70
+ // Kaggle support has been disabled
71
+ const errorMsg = "Kaggle datasets are no longer supported. Please use HuggingFace or other open-access sources.";
72
+ this.failDownload(datasetId, errorMsg);
73
+ throw new Error(errorMsg);
91
74
  }
92
75
  throw new Error(`Download logic for ${source} not yet implemented`);
93
76
  }
@@ -35,7 +35,7 @@ export class InstallService {
35
35
  // Update metadata
36
36
  const absolutePath = path.resolve(targetPath);
37
37
  this.metadataStore.updateInstallPath(datasetId, absolutePath);
38
- console.log(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
38
+ console.error(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
39
39
  return absolutePath;
40
40
  }
41
41
  }
@@ -64,11 +64,22 @@ export class JobManager extends EventEmitter {
64
64
  this.activeWorkers++;
65
65
  this.updateJob(job.id, { status: "running", status_text: "Picked up by worker" });
66
66
  const startTime = Date.now();
67
+ const listeners = this.listenerCount("processJob");
68
+ console.error(`[JobManager] Emitting processJob for ${job.id}. Active listeners: ${listeners}`);
67
69
  // In a real system, we'd have a registry of handlers for each JobType.
68
70
  // For now, we emit an event so the orchestrator can run it.
69
- this.emit("processJob", job, async (task) => {
71
+ this.emit("processJob", job, async (jobExecutionTask) => {
72
+ console.error(`[JobManager] Wrapper received jobExecutionTask: ${typeof jobExecutionTask}`);
73
+ if (typeof jobExecutionTask !== 'function') {
74
+ console.error(`[JobManager] Error: jobExecutionTask is NOT a function! It is: ${typeof jobExecutionTask}`);
75
+ this.updateJob(job.id, {
76
+ status: "failed",
77
+ status_text: "Internal error: jobExecutionTask is not a function"
78
+ });
79
+ return;
80
+ }
70
81
  try {
71
- const resultUrl = await task();
82
+ const resultUrl = await jobExecutionTask();
72
83
  const duration = Date.now() - startTime;
73
84
  this.updateJob(job.id, {
74
85
  status: "completed",
@@ -103,14 +114,6 @@ export class JobManager extends EventEmitter {
103
114
  // Try to start another worker if capacity allows
104
115
  this.processQueue();
105
116
  }
106
- /**
107
- * Helper to run a task as a job with automatic status updates
108
- */
109
- runJob(id, task) {
110
- this.emit("processJob", { id }, async () => {
111
- return await task((updates) => this.updateJob(id, updates));
112
- });
113
- }
114
117
  /**
115
118
  * Update job status and progress
116
119
  */
@@ -118,6 +121,10 @@ export class JobManager extends EventEmitter {
118
121
  const job = this.store.getJob(id);
119
122
  if (!job)
120
123
  return;
124
+ // Correctly handle metadata update if it's an object
125
+ if (updates.metadata && typeof updates.metadata !== 'string') {
126
+ updates.metadata = JSON.stringify(updates.metadata);
127
+ }
121
128
  const updatedJob = {
122
129
  ...job,
123
130
  ...updates,
@@ -90,7 +90,7 @@ export class MonitoringService {
90
90
  }
91
91
  }
92
92
  async sendToWebhook(webhook, diff) {
93
- console.log(`[MonitoringService] Sending notification to ${webhook.name} (${webhook.channel}) for dataset ${diff.dataset_id}`);
93
+ console.error(`[MonitoringService] Sending notification to ${webhook.name} (${webhook.channel}) for dataset ${diff.dataset_id}`);
94
94
  // In a real implementation, this would be an HTTP POST
95
95
  // For now, we simulate the payload
96
96
  const payload = {
@@ -101,7 +101,7 @@ export class MonitoringService {
101
101
  // await axios.post(webhook.url, payload);
102
102
  }
103
103
  async triggerReprocess(datasetId) {
104
- console.log(`[MonitoringService] Auto-reprocessing dataset ${datasetId}...`);
104
+ console.error(`[MonitoringService] Auto-reprocessing dataset ${datasetId}...`);
105
105
  // This would call IngestionService or similar
106
106
  }
107
107
  }
@@ -8,8 +8,8 @@ export class HuggingFaceScraper {
8
8
  * Bulk discovery: Fetch many datasets quickly without deep details.
9
9
  * Hits the 25k target in minutes.
10
10
  */
11
- async scrapeBulk(limit = 1000, domainFilter) {
12
- const filterMsg = domainFilter ? `, domain: ${domainFilter}` : "";
11
+ async scrapeBulk(limit = 1000, query) {
12
+ const filterMsg = query ? `, query: ${query}` : "";
13
13
  console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
14
14
  const results = [];
15
15
  let processed = 0;
@@ -18,7 +18,7 @@ export class HuggingFaceScraper {
18
18
  for await (const ds of listDatasets({
19
19
  limit: limit,
20
20
  additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
21
- search: { query: domainFilter },
21
+ search: { query: query },
22
22
  ...(hfToken ? { accessToken: hfToken } : {})
23
23
  })) {
24
24
  if (results.length >= limit)
@@ -86,9 +86,9 @@ export class HuggingFaceScraper {
86
86
  }
87
87
  return results;
88
88
  }
89
- async scrape(limit = 100, applyMVPFilters = true, domainFilter // Optional: filter by domain (medicine, healthcare, security, etc.)
89
+ async scrape(limit = 100, applyMVPFilters = true, query // Use as general search query
90
90
  ) {
91
- const filterMsg = domainFilter ? `, domain: ${domainFilter}` : "";
91
+ const filterMsg = query ? `, query: ${query}` : "";
92
92
  console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
93
93
  const results = [];
94
94
  let processed = 0;
@@ -105,7 +105,7 @@ export class HuggingFaceScraper {
105
105
  for await (const ds of listDatasets({
106
106
  limit: fetchLimit,
107
107
  additionalFields: ["description", "tags"],
108
- search: { query: domainFilter },
108
+ search: { query: query },
109
109
  ...(hfToken ? { accessToken: hfToken } : {})
110
110
  })) {
111
111
  if (results.length >= limit)
@@ -192,8 +192,8 @@ export class HuggingFaceScraper {
192
192
  const columns = this.extractColumns(cardData, splits);
193
193
  const task = this.extractTask(tags);
194
194
  const domain = classifyDomain(description, tags, repoId, task);
195
- if (domainFilter && domain !== domainFilter)
196
- return;
195
+ // REMOVED strict domain filtering that caused search bias
196
+ // if (query && domain !== query) return;
197
197
  const metadata = {
198
198
  id: repoId,
199
199
  source: "huggingface",
@@ -17,7 +17,7 @@ export class MetadataStore {
17
17
  // Add install_path if missing
18
18
  try {
19
19
  this.db.exec("ALTER TABLE datasets ADD COLUMN install_path TEXT");
20
- console.log("[MetadataStore] Migrated: Added install_path column");
20
+ console.error("[MetadataStore] Migrated: Added install_path column");
21
21
  }
22
22
  catch (e) {
23
23
  // Probably already exists
@@ -315,10 +315,25 @@ export class MetadataStore {
315
315
  * Perform database maintenance (VACUUM, ANALYZE).
316
316
  */
317
317
  optimize() {
318
- console.log("[MetadataStore] Running database optimization (VACUUM, ANALYZE)...");
318
+ console.error("[MetadataStore] Running database optimization (VACUUM, ANALYZE)...");
319
319
  this.db.exec("VACUUM");
320
320
  this.db.exec("ANALYZE");
321
321
  }
322
+ /**
323
+ * Delete all datasets from a specific source
324
+ */
325
+ deleteBySource(source) {
326
+ const info = this.db.prepare("DELETE FROM datasets WHERE source = ?").run(source);
327
+ console.error(`[MetadataStore] Deleted ${info.changes} datasets from source: ${source}`);
328
+ return info.changes;
329
+ }
330
+ /**
331
+ * Get all dataset IDs from a specific source
332
+ */
333
+ getDatasetIdsBySource(source) {
334
+ const rows = this.db.prepare("SELECT id FROM datasets WHERE source = ?").all(source);
335
+ return rows.map(r => r.id);
336
+ }
322
337
  close() {
323
338
  this.db.close();
324
339
  }