@vespermcp/mcp-server 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -4
- package/build/cleaning/cleaner.js +27 -2
- package/build/cleaning/executor.js +7 -6
- package/build/cleaning/planner.js +16 -4
- package/build/config/config-manager.js +199 -0
- package/build/export/exporter.js +26 -2
- package/build/index.js +272 -72
- package/build/ingestion/ingestor.js +17 -16
- package/build/ingestion/kaggle-downloader.js +25 -2
- package/build/install/install-service.js +1 -1
- package/build/jobs/manager.js +17 -10
- package/build/metadata/monitoring-service.js +2 -2
- package/build/metadata/scraper.js +8 -8
- package/build/metadata/store.js +17 -2
- package/build/monitoring/observability.js +2 -2
- package/build/preparation/target-detector.js +75 -0
- package/build/python/cleaner.py +226 -0
- package/build/python/export_engine.py +131 -0
- package/build/python/framework_adapters.py +100 -0
- package/build/python/github_adapter.py +106 -0
- package/build/python/image_engine.py +86 -0
- package/build/python/media_engine.py +133 -0
- package/build/python/nasa_adapter.py +82 -0
- package/build/python/quality_engine.py +243 -0
- package/build/python/splitter_engine.py +283 -0
- package/build/python/target_engine.py +154 -0
- package/build/python/test_framework_adapters.py +61 -0
- package/build/python/uci_adapter.py +94 -0
- package/build/python/worldbank_adapter.py +99 -0
- package/build/quality/analyzer.js +40 -4
- package/build/quality/image-analyzer.js +73 -5
- package/build/quality/media-analyzer.js +74 -5
- package/build/scripts/cleanup-kaggle.js +41 -0
- package/build/scripts/repro-bug.js +37 -0
- package/build/scripts/repro-export-bug.js +56 -0
- package/build/scripts/test-mcp-v5.js +12 -11
- package/build/scripts/test-production-sync.js +36 -0
- package/build/scripts/test-target-detector.js +29 -0
- package/build/scripts/test-write.js +14 -0
- package/build/scripts/verify-integration.js +57 -0
- package/build/scripts/verify-priority.js +33 -0
- package/build/search/engine.js +13 -2
- package/build/search/jit-orchestrator.js +6 -40
- package/build/search/vector-store.js +18 -0
- package/build/splitting/splitter.js +27 -2
- package/build/tools/formatter.js +23 -8
- package/build/utils/downloader.js +2 -2
- package/build/utils/selector.js +69 -0
- package/package.json +8 -4
- package/src/python/cleaner.py +33 -3
- package/src/python/export_engine.py +19 -0
- package/src/python/target_engine.py +154 -0
package/build/index.js
CHANGED
|
@@ -23,30 +23,94 @@ import { CacheService, MockRedisProvider } from "./cache/service.js";
|
|
|
23
23
|
import { ImageAnalyzer } from "./quality/image-analyzer.js";
|
|
24
24
|
import { MediaAnalyzer } from "./quality/media-analyzer.js";
|
|
25
25
|
import { QualityOrchestrator } from "./quality/quality-orchestrator.js";
|
|
26
|
+
import { ConfigManager } from "./config/config-manager.js";
|
|
27
|
+
import { Selector } from "./utils/selector.js";
|
|
26
28
|
// Determine absolute paths relative to the compiled script
|
|
27
29
|
const __filename = fileURLToPath(import.meta.url);
|
|
28
30
|
const __dirname = path.dirname(__filename);
|
|
29
|
-
//
|
|
30
|
-
const
|
|
31
|
-
|
|
32
|
-
const
|
|
31
|
+
// appRoot: Where the source code/scripts are (inside node_modules or source)
|
|
32
|
+
const appRoot = path.join(__dirname, "..");
|
|
33
|
+
// dataRoot: Where database and user data live (in user home) to prevent data loss on update and running from node_modules
|
|
34
|
+
const homeDir = process.env.HOME || process.env.USERPROFILE || appRoot;
|
|
35
|
+
const dataRoot = path.join(homeDir, ".vesper");
|
|
36
|
+
// Ensure data directory exists
|
|
37
|
+
if (!fs.existsSync(dataRoot))
|
|
38
|
+
fs.mkdirSync(dataRoot, { recursive: true });
|
|
39
|
+
const dbPath = path.join(dataRoot, "data", "metadata.db");
|
|
40
|
+
const vectorPath = path.join(dataRoot, "data", "vectors.json");
|
|
41
|
+
const errorLogPath = path.join(dataRoot, "vesper_errors.log");
|
|
42
|
+
function logError(err, context) {
|
|
43
|
+
const timestamp = new Date().toISOString();
|
|
44
|
+
const stack = err.stack || String(err);
|
|
45
|
+
const msg = `[${timestamp}] ERROR in ${context}:\n${stack}\n${"-".repeat(50)}\n`;
|
|
46
|
+
fs.appendFileSync(errorLogPath, msg);
|
|
47
|
+
console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Sync Python scripts from the application package to the stable data directory (~/.vesper/python)
|
|
51
|
+
*/
|
|
52
|
+
function syncPythonScripts(appRoot, dataRoot) {
|
|
53
|
+
const pythonDest = path.join(dataRoot, "python");
|
|
54
|
+
if (!fs.existsSync(pythonDest))
|
|
55
|
+
fs.mkdirSync(pythonDest, { recursive: true });
|
|
56
|
+
// Sources to check for Python scripts
|
|
57
|
+
const sources = [
|
|
58
|
+
path.join(appRoot, "src", "python"),
|
|
59
|
+
path.join(appRoot, "build", "python"),
|
|
60
|
+
path.join(appRoot, "python")
|
|
61
|
+
];
|
|
62
|
+
let syncedCount = 0;
|
|
63
|
+
for (const src of sources) {
|
|
64
|
+
if (fs.existsSync(src)) {
|
|
65
|
+
const files = fs.readdirSync(src);
|
|
66
|
+
for (const file of files) {
|
|
67
|
+
if (file.endsWith(".py")) {
|
|
68
|
+
const srcPath = path.join(src, file);
|
|
69
|
+
const destPath = path.join(pythonDest, file);
|
|
70
|
+
// Only copy if file doesn't exist or is different size (basic sync)
|
|
71
|
+
const srcStat = fs.statSync(srcPath);
|
|
72
|
+
let shouldCopy = true;
|
|
73
|
+
if (fs.existsSync(destPath)) {
|
|
74
|
+
const destStat = fs.statSync(destPath);
|
|
75
|
+
if (srcStat.size === destStat.size)
|
|
76
|
+
shouldCopy = false;
|
|
77
|
+
}
|
|
78
|
+
if (shouldCopy) {
|
|
79
|
+
fs.copyFileSync(srcPath, destPath);
|
|
80
|
+
syncedCount++;
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
if (syncedCount > 0) {
|
|
87
|
+
console.error(`[Vesper] Synced ${syncedCount} Python scripts to ${pythonDest}`);
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
// Sync scripts immediately
|
|
91
|
+
syncPythonScripts(appRoot, dataRoot);
|
|
33
92
|
const metadataStore = new MetadataStore(dbPath);
|
|
34
93
|
const vectorStore = new VectorStore(vectorPath);
|
|
35
94
|
const embedder = Embedder.getInstance();
|
|
36
95
|
const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
|
|
37
96
|
const jobManager = JobManager.getInstance(metadataStore);
|
|
38
|
-
|
|
39
|
-
const
|
|
97
|
+
// Use dataRoot for storage services (persistence)
|
|
98
|
+
const dataIngestor = new DataIngestor(dataRoot, metadataStore);
|
|
99
|
+
const installService = new InstallService(dataRoot, metadataStore);
|
|
40
100
|
const cacheService = new CacheService(new MockRedisProvider());
|
|
41
|
-
const
|
|
42
|
-
const
|
|
43
|
-
const
|
|
44
|
-
const
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
const
|
|
101
|
+
const dataCleaner = new DataCleaner(__dirname);
|
|
102
|
+
const pipelineExecutor = new PipelineExecutor(dataRoot, __dirname);
|
|
103
|
+
const dataSplitter = new DataSplitter(__dirname);
|
|
104
|
+
const dataExporter = new DataExporter(__dirname);
|
|
105
|
+
// CRITICAL FIX: Pass __dirname (build directory) to analyzers
|
|
106
|
+
// Python scripts are in build/python/, so analyzers should look relative to build/
|
|
107
|
+
// NOT relative to project root (appRoot)
|
|
108
|
+
process.env.PYTHONIOENCODING = "utf-8";
|
|
109
|
+
const qualityAnalyzer = new QualityAnalyzer(cacheService, __dirname);
|
|
110
|
+
const cleaningPlanner = new CleaningPlanner(cacheService, __dirname); // Pass __dirname for TargetDetector
|
|
111
|
+
const imageAnalyzer = new ImageAnalyzer(__dirname);
|
|
112
|
+
const mediaAnalyzer = new MediaAnalyzer(__dirname);
|
|
113
|
+
const qualityOrchestrator = new QualityOrchestrator(__dirname);
|
|
50
114
|
// Subscribe to job updates for real-time streaming to the UI
|
|
51
115
|
jobManager.on("jobUpdated", (job) => {
|
|
52
116
|
const level = job.status === "failed" ? "error" : "info";
|
|
@@ -57,6 +121,92 @@ jobManager.on("jobUpdated", (job) => {
|
|
|
57
121
|
data: `${emoji} [Job ${job.id.substring(0, 8)}] ${progress} ${job.status_text}`
|
|
58
122
|
});
|
|
59
123
|
});
|
|
124
|
+
// IMPORTANT: Execute jobs when the manager emits them
|
|
125
|
+
jobManager.on("processJob", async (job, execute) => {
|
|
126
|
+
console.error(`[Vesper] Listener RECEIVED job: ${job?.id}, execute type: ${typeof execute}`);
|
|
127
|
+
if (typeof execute !== 'function') {
|
|
128
|
+
console.error(`[CRITICAL] execute is NOT a function! It is: ${typeof execute}`);
|
|
129
|
+
logError(new Error(`execute is ${typeof execute}`), "listener:execute_check");
|
|
130
|
+
return;
|
|
131
|
+
}
|
|
132
|
+
const prepareDatasetTask = async () => {
|
|
133
|
+
console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
|
|
134
|
+
const metadata = job.metadata ? JSON.parse(job.metadata) : {};
|
|
135
|
+
switch (job.type) {
|
|
136
|
+
case "prepare": return await handlePrepareJob(job.id, metadata.query);
|
|
137
|
+
case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
|
|
138
|
+
default: throw new Error(`Unhandled job type: ${job.type}`);
|
|
139
|
+
}
|
|
140
|
+
};
|
|
141
|
+
try {
|
|
142
|
+
console.error(`[Vesper] Calling execute(prepareDatasetTask) for ${job.id}...`);
|
|
143
|
+
await execute(prepareDatasetTask);
|
|
144
|
+
console.error(`[Vesper] execute(prepareDatasetTask) COMPLETED for ${job.id}`);
|
|
145
|
+
}
|
|
146
|
+
catch (e) {
|
|
147
|
+
logError(e, `processJob:${job.type}:${job.id}`);
|
|
148
|
+
console.error(`[Vesper] Error in execute wrapper for ${job.id}: ${e.message}`);
|
|
149
|
+
}
|
|
150
|
+
});
|
|
151
|
+
/**
|
|
152
|
+
* Logic for preparing a dataset (Search + Ingest + Process)
|
|
153
|
+
*/
|
|
154
|
+
async function handlePrepareJob(jobId, query) {
|
|
155
|
+
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
156
|
+
update({ progress: 10, status_text: "Searching for best dataset matching query..." });
|
|
157
|
+
const results = await searchEngine.search(query, { limit: 1 });
|
|
158
|
+
if (results.length === 0) {
|
|
159
|
+
throw new Error("No datasets found matching the query. Try refining your search terms.");
|
|
160
|
+
}
|
|
161
|
+
const topDataset = results[0];
|
|
162
|
+
update({
|
|
163
|
+
progress: 20,
|
|
164
|
+
status_text: `Matched: ${topDataset.name} (${topDataset.source})`
|
|
165
|
+
});
|
|
166
|
+
const source = topDataset.source;
|
|
167
|
+
// Pre-check credentials for Kaggle
|
|
168
|
+
if (source === "kaggle") {
|
|
169
|
+
if (!process.env.KAGGLE_USERNAME || !process.env.KAGGLE_KEY ||
|
|
170
|
+
process.env.KAGGLE_USERNAME === "YOUR_KAGGLE_USERNAME") {
|
|
171
|
+
throw new Error("Kaggle credentials not set. Use 'kaggle login' or set KAGGLE_USERNAME/KAGGLE_KEY.");
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
update({ progress: 30, status_text: `Starting download from ${source}...` });
|
|
175
|
+
// ensureData handles download and returns path to the raw file
|
|
176
|
+
const rawFilePath = await dataIngestor.ensureData(topDataset.id, source, (msg, prog) => {
|
|
177
|
+
update({ status_text: msg, progress: 30 + (prog ? Math.floor(prog * 0.4) : 0) });
|
|
178
|
+
});
|
|
179
|
+
update({ progress: 70, status_text: "Analyzing dataset quality..." });
|
|
180
|
+
const report = await qualityAnalyzer.analyze(rawFilePath);
|
|
181
|
+
// Update local metadata with quality info
|
|
182
|
+
metadataStore.saveDataset({
|
|
183
|
+
...topDataset,
|
|
184
|
+
quality_score: report.overall_score
|
|
185
|
+
});
|
|
186
|
+
update({ progress: 85, status_text: "Installing dataset into project..." });
|
|
187
|
+
const installPath = await installService.install(topDataset.id, rawFilePath);
|
|
188
|
+
update({ progress: 100, status_text: "Preparation complete!" });
|
|
189
|
+
return installPath;
|
|
190
|
+
}
|
|
191
|
+
/**
|
|
192
|
+
* Logic for cleaning a dataset
|
|
193
|
+
*/
|
|
194
|
+
async function handleCleanJob(jobId, datasetId, ops) {
|
|
195
|
+
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
196
|
+
let filePath = path.join(dataRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
|
|
197
|
+
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
198
|
+
const demoPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
199
|
+
if (fs.existsSync(demoPath))
|
|
200
|
+
filePath = demoPath;
|
|
201
|
+
else
|
|
202
|
+
throw new Error(`Data file not found for ${datasetId}`);
|
|
203
|
+
}
|
|
204
|
+
update({ status_text: "Cleaning dataset..." });
|
|
205
|
+
const result = await dataCleaner.clean(filePath, ops);
|
|
206
|
+
if (!result.success)
|
|
207
|
+
throw new Error(result.error);
|
|
208
|
+
return result.output_path;
|
|
209
|
+
}
|
|
60
210
|
// Create the server
|
|
61
211
|
const server = new Server({
|
|
62
212
|
name: "vesper",
|
|
@@ -301,10 +451,10 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
301
451
|
}
|
|
302
452
|
case "analyze_quality": {
|
|
303
453
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
304
|
-
let filePath = path.join(
|
|
454
|
+
let filePath = path.join(dataRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
|
|
305
455
|
// Demo Fallback for easy testing
|
|
306
456
|
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
307
|
-
const demoPath = path.join(
|
|
457
|
+
const demoPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
308
458
|
if (fs.existsSync(demoPath)) {
|
|
309
459
|
filePath = demoPath;
|
|
310
460
|
}
|
|
@@ -322,9 +472,9 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
322
472
|
}
|
|
323
473
|
case "preview_cleaning": {
|
|
324
474
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
325
|
-
let filePath = path.join(
|
|
475
|
+
let filePath = path.join(dataRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
|
|
326
476
|
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
327
|
-
const demoPath = path.join(
|
|
477
|
+
const demoPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
328
478
|
if (fs.existsSync(demoPath)) {
|
|
329
479
|
filePath = demoPath;
|
|
330
480
|
}
|
|
@@ -333,8 +483,43 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
333
483
|
}
|
|
334
484
|
}
|
|
335
485
|
const report = await qualityAnalyzer.analyze(filePath);
|
|
336
|
-
|
|
486
|
+
// Phase 1: Target Detection
|
|
487
|
+
// We use the same TargetDetector instance inside CleaningPlanner now?
|
|
488
|
+
// Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
|
|
489
|
+
// OR let the planner handle it if we update its signature to accept filePath.
|
|
490
|
+
// Let's check `CleaningPlanner.generatePlan` signature again.
|
|
491
|
+
// We updated it to accept `targetInfo`.
|
|
492
|
+
// So we need to run detection HERE and pass it.
|
|
493
|
+
// But `TargetDetector` is not exposed in `index.ts` scope yet.
|
|
494
|
+
// Let's create a global instance or use the one inside planner if exposed (it's private).
|
|
495
|
+
// Better approach: Instantiate TargetDetector here in index.ts for the tool content.
|
|
496
|
+
// Quick fix: Instantiate local detector or make global.
|
|
497
|
+
// I'll make a global `targetDetector` constant in index.ts
|
|
498
|
+
// But wait, I updated `CleaningPlanner` to instantiate its own detector.
|
|
499
|
+
// Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
|
|
500
|
+
// RETRY STRATEGY:
|
|
501
|
+
// 1. Instantiate `targetDetector` in `index.ts`.
|
|
502
|
+
// 2. Run `detectTarget(filePath)`.
|
|
503
|
+
// 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
|
|
504
|
+
// I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
|
|
505
|
+
// But since I'm in this tool, I can't look back.
|
|
506
|
+
// I will assume I can add it, or just do it inside the case for now.
|
|
507
|
+
// To do it properly, I should have added `targetDetector` to the global scope in previous step.
|
|
508
|
+
// Let's do that in a separate step if needed.
|
|
509
|
+
// For now, I'll instantiate it here.
|
|
510
|
+
const { TargetDetector } = await import("./preparation/target-detector.js");
|
|
511
|
+
const detector = new TargetDetector(__dirname);
|
|
512
|
+
const targetResult = await detector.detectTarget(filePath);
|
|
513
|
+
const targetInfo = targetResult.target_column ? {
|
|
514
|
+
target: targetResult.target_column,
|
|
515
|
+
confidence: targetResult.confidence
|
|
516
|
+
} : undefined;
|
|
517
|
+
const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
|
|
337
518
|
let explanation = `### 📋 Cleaning Plan for ${datasetId}\n\n`;
|
|
519
|
+
if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
|
|
520
|
+
explanation += `🎯 **Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
|
|
521
|
+
explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
|
|
522
|
+
}
|
|
338
523
|
explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
|
|
339
524
|
if (plan.operations.length === 0) {
|
|
340
525
|
explanation += "✅ No cleaning operations required.";
|
|
@@ -351,25 +536,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
351
536
|
case "custom_clean": {
|
|
352
537
|
const datasetId = String(request.params.arguments?.dataset_id);
|
|
353
538
|
const ops = request.params.arguments?.operations;
|
|
354
|
-
let filePath = path.join(projectRoot, "data", "raw", `${datasetId.replace(/\//g, "_")}.csv`);
|
|
355
|
-
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
356
|
-
const demoPath = path.join(projectRoot, "e2e_demo_output", "raw_data.csv");
|
|
357
|
-
if (fs.existsSync(demoPath)) {
|
|
358
|
-
filePath = demoPath;
|
|
359
|
-
}
|
|
360
|
-
else {
|
|
361
|
-
throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}`);
|
|
362
|
-
}
|
|
363
|
-
}
|
|
364
539
|
const job = jobManager.createJob("clean", 0, { datasetId, ops });
|
|
365
|
-
// Run in background
|
|
366
|
-
jobManager.runJob(job.id, async (update) => {
|
|
367
|
-
update({ status_text: "Cleaning dataset..." });
|
|
368
|
-
const result = await dataCleaner.clean(filePath, ops);
|
|
369
|
-
if (!result.success)
|
|
370
|
-
throw new Error(result.error);
|
|
371
|
-
return result.output_path;
|
|
372
|
-
});
|
|
373
540
|
return {
|
|
374
541
|
content: [{ type: "text", text: `Job started successfully. ID: ${job.id}. Use check_job_status to monitor progress.` }]
|
|
375
542
|
};
|
|
@@ -377,40 +544,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
377
544
|
case "prepare_dataset": {
|
|
378
545
|
const query = String(request.params.arguments?.query);
|
|
379
546
|
const job = jobManager.createJob("prepare", 0, { query });
|
|
380
|
-
// Orchestrated Background Task
|
|
381
|
-
jobManager.runJob(job.id, async (update) => {
|
|
382
|
-
update({ progress: 10, status_text: "Searching for best dataset matching query..." });
|
|
383
|
-
const results = await searchEngine.search(query, { limit: 1 });
|
|
384
|
-
if (results.length === 0)
|
|
385
|
-
throw new Error("No datasets found matching the query.");
|
|
386
|
-
const topDataset = results[0];
|
|
387
|
-
// Phase 6: Real Ingestion
|
|
388
|
-
update({
|
|
389
|
-
progress: 20,
|
|
390
|
-
status_text: `Matched: ${topDataset.name} (${topDataset.source})`
|
|
391
|
-
});
|
|
392
|
-
const source = topDataset.source;
|
|
393
|
-
const filePath = await dataIngestor.ensureData(topDataset.id, source, (msg, prog) => {
|
|
394
|
-
update({
|
|
395
|
-
status_text: msg,
|
|
396
|
-
progress: prog !== undefined ? 20 + Math.floor(prog * 0.3) : undefined // 20% -> 50%
|
|
397
|
-
});
|
|
398
|
-
});
|
|
399
|
-
update({ progress: 55, status_text: "Analyzing dataset quality..." });
|
|
400
|
-
const quality = await qualityAnalyzer.analyze(filePath);
|
|
401
|
-
const pipelineResult = await pipelineExecutor.runPipeline(topDataset.id, filePath, "csv", (msg) => {
|
|
402
|
-
update({ status_text: msg });
|
|
403
|
-
});
|
|
404
|
-
update({ progress: 90, status_text: "Installing dataset into codebase..." });
|
|
405
|
-
const installPath = await installService.install(topDataset.id, pipelineResult.final_output_path);
|
|
406
|
-
update({ progress: 100, status_text: "Preparation complete!" });
|
|
407
|
-
const message = `✅ Preparation complete for ${topDataset.name}.\n` +
|
|
408
|
-
`📦 Dataset installed to: ${installPath}\n` +
|
|
409
|
-
`🚀 You can now use this dataset for training your models.`;
|
|
410
|
-
return message;
|
|
411
|
-
});
|
|
412
547
|
return {
|
|
413
|
-
content: [{ type: "text", text: `
|
|
548
|
+
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you.` }]
|
|
414
549
|
};
|
|
415
550
|
}
|
|
416
551
|
case "compare_datasets": {
|
|
@@ -456,7 +591,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
456
591
|
// Check if we need conversion
|
|
457
592
|
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
458
593
|
if (currentExt !== requestedFormat) {
|
|
459
|
-
console.
|
|
594
|
+
console.error(`[Export] Format mismatch (${currentExt} vs ${requestedFormat}). Converting...`);
|
|
460
595
|
try {
|
|
461
596
|
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, requestedFormat);
|
|
462
597
|
sourcePath = pipelineResult.final_output_path;
|
|
@@ -622,9 +757,74 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
622
757
|
}
|
|
623
758
|
});
|
|
624
759
|
async function main() {
|
|
760
|
+
const args = process.argv.slice(2);
|
|
761
|
+
const isSetup = args.includes("--setup") || args.includes("setup");
|
|
762
|
+
const isSilent = args.includes("--silent");
|
|
763
|
+
// If run in setup mode OR in a terminal without args (human call), show setup wizard
|
|
764
|
+
if (isSetup || (process.stdin.isTTY && args.length === 0)) {
|
|
765
|
+
await runSetupWizard(isSilent);
|
|
766
|
+
return;
|
|
767
|
+
}
|
|
768
|
+
// Otherwise proceed to server mode (for IDEs/Agents)
|
|
769
|
+
console.error(`[Vesper] Starting server...`);
|
|
770
|
+
console.error(`[Vesper] dataRoot: ${dataRoot}`);
|
|
771
|
+
console.error(`[Vesper] dbPath: ${dbPath}`);
|
|
625
772
|
const transport = new StdioServerTransport();
|
|
626
773
|
await server.connect(transport);
|
|
627
774
|
console.error("Vesper MCP server running on stdio");
|
|
775
|
+
console.error("Tip: To configure Vesper for your IDE, run: npx @vespermcp/mcp-server --setup");
|
|
776
|
+
console.log("[Vesper] Main loop finished");
|
|
777
|
+
}
|
|
778
|
+
async function runSetupWizard(silent = false) {
|
|
779
|
+
const configManager = new ConfigManager();
|
|
780
|
+
if (!silent) {
|
|
781
|
+
console.log(`\n🚀 Welcome to Vesper MCP Setup!`);
|
|
782
|
+
console.log(`--------------------------------`);
|
|
783
|
+
console.log(`Searching for IDE configurations...`);
|
|
784
|
+
}
|
|
785
|
+
const ides = configManager.detectIDEs();
|
|
786
|
+
if (ides.length === 0) {
|
|
787
|
+
if (!silent) {
|
|
788
|
+
console.log("\n❌ No supported IDEs detected.");
|
|
789
|
+
console.log("I checked for:");
|
|
790
|
+
console.log(" - Cursor (Global & Project)");
|
|
791
|
+
console.log(" - Claude Desktop");
|
|
792
|
+
console.log(" - VS Code (Standard MCP, Copilot Chat, Cline, Roo Code)");
|
|
793
|
+
console.log("\nIf you are using VS Code or Cursor, please make sure they are installed.");
|
|
794
|
+
console.log("For project-specific setup, run this command inside your project folder.");
|
|
795
|
+
}
|
|
796
|
+
return;
|
|
797
|
+
}
|
|
798
|
+
if (silent) {
|
|
799
|
+
for (const ide of ides) {
|
|
800
|
+
await configManager.installTo(ide);
|
|
801
|
+
}
|
|
802
|
+
return;
|
|
803
|
+
}
|
|
804
|
+
console.log(`\nFound ${ides.length} potential application(s):`);
|
|
805
|
+
const selector = new Selector("Select applications to configure for Vesper:", ides.map(ide => ({
|
|
806
|
+
name: ide.name,
|
|
807
|
+
value: ide,
|
|
808
|
+
selected: true
|
|
809
|
+
})));
|
|
810
|
+
const selectedIDEs = await selector.run();
|
|
811
|
+
if (selectedIDEs.length > 0) {
|
|
812
|
+
console.log(`\nInstalling to ${selectedIDEs.length} application(s)...\n`);
|
|
813
|
+
for (const ide of selectedIDEs) {
|
|
814
|
+
process.stdout.write(`Installing to ${ide.name}... `);
|
|
815
|
+
const success = await configManager.installTo(ide);
|
|
816
|
+
if (success) {
|
|
817
|
+
process.stdout.write("✅\n");
|
|
818
|
+
}
|
|
819
|
+
else {
|
|
820
|
+
process.stdout.write("❌ (Check permissions or if file is in use)\n");
|
|
821
|
+
}
|
|
822
|
+
}
|
|
823
|
+
console.log("\n✨ Setup complete! Please RESTART your IDE(s) to apply changes.");
|
|
824
|
+
}
|
|
825
|
+
else {
|
|
826
|
+
console.log("\nSetup skipped. No applications selected.");
|
|
827
|
+
}
|
|
628
828
|
}
|
|
629
829
|
main().catch((error) => {
|
|
630
830
|
console.error("Server error:", error);
|
|
@@ -18,6 +18,18 @@ export class DataIngestor {
|
|
|
18
18
|
this.hfDownloader = new HFDownloader();
|
|
19
19
|
this.kaggleDownloader = new KaggleDownloader();
|
|
20
20
|
}
|
|
21
|
+
/**
|
|
22
|
+
* Check if Kaggle credentials are available
|
|
23
|
+
*/
|
|
24
|
+
hasKaggleCredentials() {
|
|
25
|
+
return this.kaggleDownloader.hasCredentials();
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Get helpful error message if Kaggle credentials are missing
|
|
29
|
+
*/
|
|
30
|
+
getKaggleCredentialError() {
|
|
31
|
+
return this.kaggleDownloader.getCredentialError();
|
|
32
|
+
}
|
|
21
33
|
/**
|
|
22
34
|
* Ensures a dataset is available locally
|
|
23
35
|
*/
|
|
@@ -28,7 +40,7 @@ export class DataIngestor {
|
|
|
28
40
|
return status.local_path;
|
|
29
41
|
}
|
|
30
42
|
if (status && status.status === 'downloading') {
|
|
31
|
-
console.
|
|
43
|
+
console.error(`[Ingestor] Dataset ${datasetId} status is 'downloading'. Attempting to resume/concurrently monitor.`);
|
|
32
44
|
// In a better system we'd use a lock, but for now we let it resume
|
|
33
45
|
// the RobustDownloader handles the actual file locking/range logic.
|
|
34
46
|
}
|
|
@@ -55,21 +67,10 @@ export class DataIngestor {
|
|
|
55
67
|
}
|
|
56
68
|
}
|
|
57
69
|
else if (source === "kaggle") {
|
|
58
|
-
|
|
59
|
-
const
|
|
60
|
-
this.
|
|
61
|
-
|
|
62
|
-
const primaryFile = await this.kaggleDownloader.download(datasetId.replace("kaggle:", ""), targetDir, (progress) => {
|
|
63
|
-
onProgress?.("Downloading Kaggle archive...", progress);
|
|
64
|
-
});
|
|
65
|
-
const stats = fs.statSync(primaryFile);
|
|
66
|
-
this.completeDownload(datasetId, primaryFile, stats.size);
|
|
67
|
-
return primaryFile;
|
|
68
|
-
}
|
|
69
|
-
catch (e) {
|
|
70
|
-
this.failDownload(datasetId, e.message);
|
|
71
|
-
throw e;
|
|
72
|
-
}
|
|
70
|
+
// Kaggle support has been disabled
|
|
71
|
+
const errorMsg = "Kaggle datasets are no longer supported. Please use HuggingFace or other open-access sources.";
|
|
72
|
+
this.failDownload(datasetId, errorMsg);
|
|
73
|
+
throw new Error(errorMsg);
|
|
73
74
|
}
|
|
74
75
|
throw new Error(`Download logic for ${source} not yet implemented`);
|
|
75
76
|
}
|
|
@@ -11,13 +11,36 @@ export class KaggleDownloader {
|
|
|
11
11
|
this.key = key || process.env.KAGGLE_KEY || "";
|
|
12
12
|
this.downloader = new RobustDownloader();
|
|
13
13
|
}
|
|
14
|
+
/**
|
|
15
|
+
* Check if Kaggle credentials are available
|
|
16
|
+
*/
|
|
17
|
+
hasCredentials() {
|
|
18
|
+
return !!(this.username && this.key);
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Get a helpful error message if credentials are missing
|
|
22
|
+
*/
|
|
23
|
+
getCredentialError() {
|
|
24
|
+
if (!this.username && !this.key) {
|
|
25
|
+
return "Kaggle credentials missing. Please set KAGGLE_USERNAME and KAGGLE_KEY environment variables.\n" +
|
|
26
|
+
"💡 Tip: Get your API token from https://www.kaggle.com/settings → API → Create New Token\n" +
|
|
27
|
+
"💡 Alternative: Download the dataset manually and use analyze_quality() on local files.";
|
|
28
|
+
}
|
|
29
|
+
if (!this.username) {
|
|
30
|
+
return "KAGGLE_USERNAME is missing. Please set it in your MCP config or environment variables.";
|
|
31
|
+
}
|
|
32
|
+
if (!this.key) {
|
|
33
|
+
return "KAGGLE_KEY is missing. Please set it in your MCP config or environment variables.";
|
|
34
|
+
}
|
|
35
|
+
return "";
|
|
36
|
+
}
|
|
14
37
|
/**
|
|
15
38
|
* Downloads and extracts a Kaggle dataset
|
|
16
39
|
* returns the path to the primary data file
|
|
17
40
|
*/
|
|
18
41
|
async download(repoId, targetDir, onProgress) {
|
|
19
|
-
if (!this.
|
|
20
|
-
throw new Error(
|
|
42
|
+
if (!this.hasCredentials()) {
|
|
43
|
+
throw new Error(this.getCredentialError());
|
|
21
44
|
}
|
|
22
45
|
const auth = Buffer.from(`${this.username}:${this.key}`).toString('base64');
|
|
23
46
|
const url = `https://www.kaggle.com/api/v1/datasets/download/${repoId}`;
|
|
@@ -35,7 +35,7 @@ export class InstallService {
|
|
|
35
35
|
// Update metadata
|
|
36
36
|
const absolutePath = path.resolve(targetPath);
|
|
37
37
|
this.metadataStore.updateInstallPath(datasetId, absolutePath);
|
|
38
|
-
console.
|
|
38
|
+
console.error(`[InstallService] Dataset ${datasetId} installed to ${absolutePath}`);
|
|
39
39
|
return absolutePath;
|
|
40
40
|
}
|
|
41
41
|
}
|
package/build/jobs/manager.js
CHANGED
|
@@ -64,11 +64,22 @@ export class JobManager extends EventEmitter {
|
|
|
64
64
|
this.activeWorkers++;
|
|
65
65
|
this.updateJob(job.id, { status: "running", status_text: "Picked up by worker" });
|
|
66
66
|
const startTime = Date.now();
|
|
67
|
+
const listeners = this.listenerCount("processJob");
|
|
68
|
+
console.error(`[JobManager] Emitting processJob for ${job.id}. Active listeners: ${listeners}`);
|
|
67
69
|
// In a real system, we'd have a registry of handlers for each JobType.
|
|
68
70
|
// For now, we emit an event so the orchestrator can run it.
|
|
69
|
-
this.emit("processJob", job, async (
|
|
71
|
+
this.emit("processJob", job, async (jobExecutionTask) => {
|
|
72
|
+
console.error(`[JobManager] Wrapper received jobExecutionTask: ${typeof jobExecutionTask}`);
|
|
73
|
+
if (typeof jobExecutionTask !== 'function') {
|
|
74
|
+
console.error(`[JobManager] Error: jobExecutionTask is NOT a function! It is: ${typeof jobExecutionTask}`);
|
|
75
|
+
this.updateJob(job.id, {
|
|
76
|
+
status: "failed",
|
|
77
|
+
status_text: "Internal error: jobExecutionTask is not a function"
|
|
78
|
+
});
|
|
79
|
+
return;
|
|
80
|
+
}
|
|
70
81
|
try {
|
|
71
|
-
const resultUrl = await
|
|
82
|
+
const resultUrl = await jobExecutionTask();
|
|
72
83
|
const duration = Date.now() - startTime;
|
|
73
84
|
this.updateJob(job.id, {
|
|
74
85
|
status: "completed",
|
|
@@ -103,14 +114,6 @@ export class JobManager extends EventEmitter {
|
|
|
103
114
|
// Try to start another worker if capacity allows
|
|
104
115
|
this.processQueue();
|
|
105
116
|
}
|
|
106
|
-
/**
|
|
107
|
-
* Helper to run a task as a job with automatic status updates
|
|
108
|
-
*/
|
|
109
|
-
runJob(id, task) {
|
|
110
|
-
this.emit("processJob", { id }, async () => {
|
|
111
|
-
return await task((updates) => this.updateJob(id, updates));
|
|
112
|
-
});
|
|
113
|
-
}
|
|
114
117
|
/**
|
|
115
118
|
* Update job status and progress
|
|
116
119
|
*/
|
|
@@ -118,6 +121,10 @@ export class JobManager extends EventEmitter {
|
|
|
118
121
|
const job = this.store.getJob(id);
|
|
119
122
|
if (!job)
|
|
120
123
|
return;
|
|
124
|
+
// Correctly handle metadata update if it's an object
|
|
125
|
+
if (updates.metadata && typeof updates.metadata !== 'string') {
|
|
126
|
+
updates.metadata = JSON.stringify(updates.metadata);
|
|
127
|
+
}
|
|
121
128
|
const updatedJob = {
|
|
122
129
|
...job,
|
|
123
130
|
...updates,
|
|
@@ -90,7 +90,7 @@ export class MonitoringService {
|
|
|
90
90
|
}
|
|
91
91
|
}
|
|
92
92
|
async sendToWebhook(webhook, diff) {
|
|
93
|
-
console.
|
|
93
|
+
console.error(`[MonitoringService] Sending notification to ${webhook.name} (${webhook.channel}) for dataset ${diff.dataset_id}`);
|
|
94
94
|
// In a real implementation, this would be an HTTP POST
|
|
95
95
|
// For now, we simulate the payload
|
|
96
96
|
const payload = {
|
|
@@ -101,7 +101,7 @@ export class MonitoringService {
|
|
|
101
101
|
// await axios.post(webhook.url, payload);
|
|
102
102
|
}
|
|
103
103
|
async triggerReprocess(datasetId) {
|
|
104
|
-
console.
|
|
104
|
+
console.error(`[MonitoringService] Auto-reprocessing dataset ${datasetId}...`);
|
|
105
105
|
// This would call IngestionService or similar
|
|
106
106
|
}
|
|
107
107
|
}
|
|
@@ -8,8 +8,8 @@ export class HuggingFaceScraper {
|
|
|
8
8
|
* Bulk discovery: Fetch many datasets quickly without deep details.
|
|
9
9
|
* Hits the 25k target in minutes.
|
|
10
10
|
*/
|
|
11
|
-
async scrapeBulk(limit = 1000,
|
|
12
|
-
const filterMsg =
|
|
11
|
+
async scrapeBulk(limit = 1000, query) {
|
|
12
|
+
const filterMsg = query ? `, query: ${query}` : "";
|
|
13
13
|
console.error(`[Bulk Scraper] Fetching datasets (target limit: ${limit}${filterMsg})...`);
|
|
14
14
|
const results = [];
|
|
15
15
|
let processed = 0;
|
|
@@ -18,7 +18,7 @@ export class HuggingFaceScraper {
|
|
|
18
18
|
for await (const ds of listDatasets({
|
|
19
19
|
limit: limit,
|
|
20
20
|
additionalFields: ["description", "tags", "downloadsAllTime", "createdAt"],
|
|
21
|
-
search: { query:
|
|
21
|
+
search: { query: query },
|
|
22
22
|
...(hfToken ? { accessToken: hfToken } : {})
|
|
23
23
|
})) {
|
|
24
24
|
if (results.length >= limit)
|
|
@@ -86,9 +86,9 @@ export class HuggingFaceScraper {
|
|
|
86
86
|
}
|
|
87
87
|
return results;
|
|
88
88
|
}
|
|
89
|
-
async scrape(limit = 100, applyMVPFilters = true,
|
|
89
|
+
async scrape(limit = 100, applyMVPFilters = true, query // Use as general search query
|
|
90
90
|
) {
|
|
91
|
-
const filterMsg =
|
|
91
|
+
const filterMsg = query ? `, query: ${query}` : "";
|
|
92
92
|
console.error(`Fetching datasets (target limit: ${limit}, MVP filters: ${applyMVPFilters}${filterMsg})...`);
|
|
93
93
|
const results = [];
|
|
94
94
|
let processed = 0;
|
|
@@ -105,7 +105,7 @@ export class HuggingFaceScraper {
|
|
|
105
105
|
for await (const ds of listDatasets({
|
|
106
106
|
limit: fetchLimit,
|
|
107
107
|
additionalFields: ["description", "tags"],
|
|
108
|
-
search: { query:
|
|
108
|
+
search: { query: query },
|
|
109
109
|
...(hfToken ? { accessToken: hfToken } : {})
|
|
110
110
|
})) {
|
|
111
111
|
if (results.length >= limit)
|
|
@@ -192,8 +192,8 @@ export class HuggingFaceScraper {
|
|
|
192
192
|
const columns = this.extractColumns(cardData, splits);
|
|
193
193
|
const task = this.extractTask(tags);
|
|
194
194
|
const domain = classifyDomain(description, tags, repoId, task);
|
|
195
|
-
|
|
196
|
-
|
|
195
|
+
// REMOVED strict domain filtering that caused search bias
|
|
196
|
+
// if (query && domain !== query) return;
|
|
197
197
|
const metadata = {
|
|
198
198
|
id: repoId,
|
|
199
199
|
source: "huggingface",
|