@vespermcp/mcp-server 1.2.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/build/index.js +100 -38
- package/build/ingestion/kaggle-downloader.js +2 -2
- package/build/search/jit-orchestrator.js +12 -12
- package/build/tools/formatter.js +14 -14
- package/package.json +2 -2
package/build/index.js
CHANGED
|
@@ -55,6 +55,65 @@ function logError(err, context) {
|
|
|
55
55
|
fs.appendFileSync(errorLogPath, msg);
|
|
56
56
|
console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
|
|
57
57
|
}
|
|
58
|
+
// Braille-pattern glyphs that runWithSpinner cycles through, one per tick, to animate its stderr spinner.
const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
|
|
59
|
+
/**
 * Print the multi-line Vesper launch banner (logo, tagline, subsystem list and status line).
 *
 * Takes no arguments and returns nothing; its only effect is a single console.error call.
 */
function printLaunchScreen() {
|
|
60
|
+
const screen = `
|
|
61
|
+
══════════════════════════════════════════════
|
|
62
|
+
|
|
63
|
+
██ ██ ███████ ███████ ██████ ███████ ██████
|
|
64
|
+
██ ██ ██ ██ ██ ██ ██ ██ ██
|
|
65
|
+
██ ██ █████ █████ ██████ █████ ██████
|
|
66
|
+
██ ██ ██ ██ ██ ██ ██ ██
|
|
67
|
+
████ ███████ ███████ ██ ███████ ██ ██
|
|
68
|
+
|
|
69
|
+
dataset intelligence layer
|
|
70
|
+
mcp-native • agent-first
|
|
71
|
+
|
|
72
|
+
══════════════════════════════════════════════
|
|
73
|
+
|
|
74
|
+
[ core ] initializing
|
|
75
|
+
[ splitting ] leakage-safe
|
|
76
|
+
[ quality ] multimodal scan
|
|
77
|
+
[ fusion ] guarded
|
|
78
|
+
[ synth ] generation ready
|
|
79
|
+
|
|
80
|
+
status: operational
|
|
81
|
+
`;
|
|
82
|
+
// Banner goes through console.error (stderr), not console.log.
// NOTE(review): presumably this keeps stdout free for MCP protocol traffic — confirm against the transport setup.
console.error(screen);
|
|
83
|
+
}
|
|
84
|
+
/**
 * Run an async task while showing a progress spinner on stderr.
 *
 * When stderr is not a TTY the task is invoked directly and nothing is drawn.
 * Otherwise the spinner only appears if the task is still pending after
 * 1000 ms, so fast tasks produce no output at all. Once it has appeared, a
 * final `[ok]` or `[error]` line replaces the spinner line when the task
 * settles.
 *
 * @param {string} label - Text shown next to the spinner and in the final status line.
 * @param {() => (Promise<*>|*)} task - Work to run; invoked exactly once with no arguments.
 * @returns {Promise<*>} Resolves with whatever `task` resolves to.
 * @throws Re-throws, unchanged, whatever `task` throws or rejects with.
 */
async function runWithSpinner(label, task) {
    if (!process.stderr.isTTY) {
        return task();
    }
    let frameIndex = 0;
    let timer;
    let spinnerShown = false;
    let failed = false;
    // Delay the first frame so tasks that finish quickly never flash a spinner.
    const delayedStart = setTimeout(() => {
        spinnerShown = true;
        timer = setInterval(() => {
            const frame = SPINNER_FRAMES[frameIndex % SPINNER_FRAMES.length];
            frameIndex += 1;
            // "\r" returns to column 0 so each frame overwrites the previous one.
            process.stderr.write(`\r${frame} ${label}`);
        }, 90);
    }, 1000);
    try {
        return await task();
    }
    catch (error) {
        failed = true;
        throw error;
    }
    finally {
        // Single teardown path for success and failure: stop both timers
        // first so no frame can be drawn after the final status line.
        clearTimeout(delayedStart);
        if (timer) {
            clearInterval(timer);
        }
        if (spinnerShown) {
            process.stderr.write(failed ? `\r[error] ${label} \n` : `\r[ok] ${label} \n`);
        }
    }
}
|
|
58
117
|
function extractRequestedRows(query, requirements) {
|
|
59
118
|
const text = `${query || ""} ${requirements || ""}`.toLowerCase();
|
|
60
119
|
const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
|
|
@@ -182,11 +241,11 @@ const qualityOrchestrator = new QualityOrchestrator(__dirname);
|
|
|
182
241
|
// Subscribe to job updates for real-time streaming to the UI
|
|
183
242
|
jobManager.on("jobUpdated", (job) => {
|
|
184
243
|
const level = job.status === "failed" ? "error" : "info";
|
|
185
|
-
const
|
|
244
|
+
const statusTag = job.status === "completed" ? "done" : (job.status === "failed" ? "failed" : "running");
|
|
186
245
|
const progress = job.progress > 0 ? `[${job.progress}%]` : "";
|
|
187
246
|
server.sendLoggingMessage({
|
|
188
247
|
level,
|
|
189
|
-
data:
|
|
248
|
+
data: `[${statusTag}] [Job ${job.id.substring(0, 8)}] ${progress} ${job.status_text}`
|
|
190
249
|
});
|
|
191
250
|
});
|
|
192
251
|
// IMPORTANT: Execute jobs when the manager emits them
|
|
@@ -292,7 +351,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
292
351
|
throw new Error(`Requested ${requestedRows.toLocaleString()} samples, but only ${currentRows.toLocaleString()} available across current matches. ` +
|
|
293
352
|
`Try broader query or enable additional sources.`);
|
|
294
353
|
}
|
|
295
|
-
update({ progress: 69, status_text:
|
|
354
|
+
update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
|
|
296
355
|
}
|
|
297
356
|
}
|
|
298
357
|
update({ progress: 70, status_text: "Analyzing dataset quality..." });
|
|
@@ -759,7 +818,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
759
818
|
try {
|
|
760
819
|
const localPath = await dataIngestor.ensureData(datasetId, source, () => undefined);
|
|
761
820
|
return {
|
|
762
|
-
content: [{ type: "text", text:
|
|
821
|
+
content: [{ type: "text", text: `Download complete: ${localPath}` }]
|
|
763
822
|
};
|
|
764
823
|
}
|
|
765
824
|
catch (error) {
|
|
@@ -780,7 +839,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
780
839
|
process.env.KAGGLE_USERNAME = username;
|
|
781
840
|
process.env.KAGGLE_KEY = key;
|
|
782
841
|
return {
|
|
783
|
-
content: [{ type: "text", text:
|
|
842
|
+
content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
|
|
784
843
|
};
|
|
785
844
|
}
|
|
786
845
|
case "configure_keys": {
|
|
@@ -822,7 +881,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
822
881
|
};
|
|
823
882
|
}
|
|
824
883
|
return {
|
|
825
|
-
content: [{ type: "text", text:
|
|
884
|
+
content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
|
|
826
885
|
};
|
|
827
886
|
}
|
|
828
887
|
case "get_dataset_info": {
|
|
@@ -906,14 +965,14 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
906
965
|
confidence: targetResult.confidence
|
|
907
966
|
} : undefined;
|
|
908
967
|
const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
|
|
909
|
-
let explanation = `###
|
|
968
|
+
let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
|
|
910
969
|
if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
|
|
911
|
-
explanation +=
|
|
970
|
+
explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
|
|
912
971
|
explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
|
|
913
972
|
}
|
|
914
973
|
explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
|
|
915
974
|
if (plan.operations.length === 0) {
|
|
916
|
-
explanation += "
|
|
975
|
+
explanation += "No cleaning operations required.";
|
|
917
976
|
}
|
|
918
977
|
else {
|
|
919
978
|
plan.operations.forEach((op, i) => {
|
|
@@ -1003,7 +1062,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1003
1062
|
}
|
|
1004
1063
|
}
|
|
1005
1064
|
else {
|
|
1006
|
-
console.error(`[Export]
|
|
1065
|
+
console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
|
|
1007
1066
|
}
|
|
1008
1067
|
// Build export options
|
|
1009
1068
|
const exportOpts = {};
|
|
@@ -1026,7 +1085,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1026
1085
|
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
1027
1086
|
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
1028
1087
|
// Build rich response
|
|
1029
|
-
let msg =
|
|
1088
|
+
let msg = `**Export complete**\n`;
|
|
1030
1089
|
msg += `- **File**: ${result.output_path}\n`;
|
|
1031
1090
|
msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
|
|
1032
1091
|
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
@@ -1038,12 +1097,12 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1038
1097
|
msg += `- **Preview**: ${result.preview_path}\n`;
|
|
1039
1098
|
msg += `\n`;
|
|
1040
1099
|
if (requestedFormat === "feather") {
|
|
1041
|
-
msg +=
|
|
1100
|
+
msg += `**Inspect with:**\n`;
|
|
1042
1101
|
msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
|
|
1043
1102
|
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
1044
1103
|
}
|
|
1045
1104
|
else if (requestedFormat === "parquet") {
|
|
1046
|
-
msg +=
|
|
1105
|
+
msg += `**Inspect with:**\n`;
|
|
1047
1106
|
msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
|
|
1048
1107
|
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
1049
1108
|
}
|
|
@@ -1113,7 +1172,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1113
1172
|
});
|
|
1114
1173
|
const nullDelta = result.stats.null_delta;
|
|
1115
1174
|
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
1116
|
-
let msg =
|
|
1175
|
+
let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
|
|
1117
1176
|
msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
|
|
1118
1177
|
msg += `- Null change: ${nullText}\n`;
|
|
1119
1178
|
msg += `- Output: ${result.output_path}\n`;
|
|
@@ -1143,16 +1202,16 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1143
1202
|
}
|
|
1144
1203
|
try {
|
|
1145
1204
|
const report = await imageAnalyzer.analyze(inputPath);
|
|
1146
|
-
let output = `##
|
|
1205
|
+
let output = `## Image Quality Report\n\n`;
|
|
1147
1206
|
output += `- **Total Images**: ${report.total_images}\n`;
|
|
1148
1207
|
output += `- **Corrupted**: ${report.corrupted_count}\n`;
|
|
1149
1208
|
output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
|
|
1150
1209
|
output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
|
|
1151
1210
|
if (report.individual_results.length > 0) {
|
|
1152
|
-
output += `###
|
|
1211
|
+
output += `### Sample Detail (Top 5)\n`;
|
|
1153
1212
|
report.individual_results.slice(0, 5).forEach(img => {
|
|
1154
|
-
const
|
|
1155
|
-
output += `${
|
|
1213
|
+
const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
|
|
1214
|
+
output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
|
|
1156
1215
|
});
|
|
1157
1216
|
}
|
|
1158
1217
|
return {
|
|
@@ -1173,7 +1232,7 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1173
1232
|
}
|
|
1174
1233
|
try {
|
|
1175
1234
|
const report = await mediaAnalyzer.analyze(inputPath);
|
|
1176
|
-
let output = `##
|
|
1235
|
+
let output = `## Media Quality Report\n\n`;
|
|
1177
1236
|
output += `- **Total Files**: ${report.total_files}\n`;
|
|
1178
1237
|
output += `- **OK Files**: ${report.ok_files}\n`;
|
|
1179
1238
|
output += `- **Failed Files**: ${report.failed_files}\n`;
|
|
@@ -1184,17 +1243,17 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1184
1243
|
output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
|
|
1185
1244
|
output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
|
|
1186
1245
|
}
|
|
1187
|
-
output += `\n###
|
|
1246
|
+
output += `\n### Sample Detail (Top 5)\n`;
|
|
1188
1247
|
report.details.slice(0, 5).forEach(item => {
|
|
1189
|
-
const
|
|
1248
|
+
const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
|
|
1190
1249
|
if (item.type === "audio" && 'sample_rate' in item) {
|
|
1191
|
-
output += `${
|
|
1250
|
+
output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
|
|
1192
1251
|
}
|
|
1193
1252
|
else if (item.type === "video" && 'width' in item) {
|
|
1194
|
-
output += `${
|
|
1253
|
+
output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
|
|
1195
1254
|
}
|
|
1196
1255
|
else {
|
|
1197
|
-
output += `${
|
|
1256
|
+
output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
|
|
1198
1257
|
}
|
|
1199
1258
|
});
|
|
1200
1259
|
return {
|
|
@@ -1225,39 +1284,39 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
|
1225
1284
|
metadata.unified_quality_report = report;
|
|
1226
1285
|
await metadataStore.saveDataset(metadata);
|
|
1227
1286
|
}
|
|
1228
|
-
let output = `#
|
|
1287
|
+
let output = `# Unified Quality Report\n\n`;
|
|
1229
1288
|
output += `**Dataset**: ${datasetId}\n`;
|
|
1230
1289
|
output += `**Modalities**: ${report.modalities.join(", ")}\n`;
|
|
1231
1290
|
output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
|
|
1232
1291
|
if (report.text_quality) {
|
|
1233
|
-
output += `##
|
|
1292
|
+
output += `## Text Quality\n`;
|
|
1234
1293
|
output += `- Rows: ${report.text_quality.row_count}\n`;
|
|
1235
1294
|
output += `- Columns: ${report.text_quality.column_count}\n`;
|
|
1236
1295
|
output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
|
|
1237
1296
|
output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
|
|
1238
1297
|
}
|
|
1239
1298
|
if (report.image_quality) {
|
|
1240
|
-
output += `##
|
|
1299
|
+
output += `## Image Quality\n`;
|
|
1241
1300
|
output += `- Total Images: ${report.image_quality.total_images}\n`;
|
|
1242
1301
|
output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
|
|
1243
1302
|
output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
|
|
1244
1303
|
output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
|
|
1245
1304
|
}
|
|
1246
1305
|
if (report.audio_quality) {
|
|
1247
|
-
output += `##
|
|
1306
|
+
output += `## Audio Quality\n`;
|
|
1248
1307
|
output += `- Total Files: ${report.audio_quality.total_files}\n`;
|
|
1249
1308
|
output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
|
|
1250
1309
|
output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
|
|
1251
1310
|
output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
|
|
1252
1311
|
}
|
|
1253
1312
|
if (report.video_quality) {
|
|
1254
|
-
output += `##
|
|
1313
|
+
output += `## Video Quality\n`;
|
|
1255
1314
|
output += `- Total Files: ${report.video_quality.total_files}\n`;
|
|
1256
1315
|
output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
|
|
1257
1316
|
output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
|
|
1258
1317
|
output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
|
|
1259
1318
|
}
|
|
1260
|
-
output += `##
|
|
1319
|
+
output += `## Recommendations\n`;
|
|
1261
1320
|
report.recommendations.forEach(rec => {
|
|
1262
1321
|
output += `- ${rec}\n`;
|
|
1263
1322
|
});
|
|
@@ -1285,6 +1344,9 @@ async function main() {
|
|
|
1285
1344
|
const isConfig = args.includes("config");
|
|
1286
1345
|
const isSetup = args.includes("--setup") || args.includes("setup");
|
|
1287
1346
|
const isSilent = args.includes("--silent");
|
|
1347
|
+
if (process.stdin.isTTY && !isSilent) {
|
|
1348
|
+
printLaunchScreen();
|
|
1349
|
+
}
|
|
1288
1350
|
if (isFuse) {
|
|
1289
1351
|
await runFuseCli(args);
|
|
1290
1352
|
return;
|
|
@@ -1332,7 +1394,7 @@ async function runConfigCli(args) {
|
|
|
1332
1394
|
return undefined;
|
|
1333
1395
|
};
|
|
1334
1396
|
if (isKeys) {
|
|
1335
|
-
console.log("\
|
|
1397
|
+
console.log("\nVesper Optional Keys Setup");
|
|
1336
1398
|
console.log("(Press Enter to skip any field)\n");
|
|
1337
1399
|
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
|
|
1338
1400
|
const ask = (q) => new Promise(resolve => rl.question(q, resolve));
|
|
@@ -1367,7 +1429,7 @@ async function runConfigCli(args) {
|
|
|
1367
1429
|
console.log("No new keys saved (all skipped). Core tools continue to work without keys.");
|
|
1368
1430
|
return;
|
|
1369
1431
|
}
|
|
1370
|
-
console.log(
|
|
1432
|
+
console.log(`Key(s) saved securely: ${saved.join(", ")}`);
|
|
1371
1433
|
console.log("You can now use Kaggle and gated Hugging Face datasets.");
|
|
1372
1434
|
return;
|
|
1373
1435
|
}
|
|
@@ -1391,7 +1453,7 @@ async function runConfigCli(args) {
|
|
|
1391
1453
|
secureKeys.set("kaggle_key", key);
|
|
1392
1454
|
process.env.KAGGLE_USERNAME = username;
|
|
1393
1455
|
process.env.KAGGLE_KEY = key;
|
|
1394
|
-
console.log("
|
|
1456
|
+
console.log("Key saved securely. You can now use Kaggle datasets.");
|
|
1395
1457
|
}
|
|
1396
1458
|
async function runDiscoverCli(args) {
|
|
1397
1459
|
const getArgValue = (name) => {
|
|
@@ -1509,7 +1571,7 @@ async function runDownloadCli(args) {
|
|
|
1509
1571
|
}
|
|
1510
1572
|
process.exit(1);
|
|
1511
1573
|
}
|
|
1512
|
-
console.log(
|
|
1574
|
+
console.log(`Download complete: ${localPath}`);
|
|
1513
1575
|
}
|
|
1514
1576
|
async function runFuseCli(args) {
|
|
1515
1577
|
const getArgValue = (name) => {
|
|
@@ -1569,21 +1631,21 @@ async function runFuseCli(args) {
|
|
|
1569
1631
|
async function runSetupWizard(silent = false) {
|
|
1570
1632
|
const configManager = new ConfigManager();
|
|
1571
1633
|
if (!silent) {
|
|
1572
|
-
console.log(`\
|
|
1634
|
+
console.log(`\nVesper MCP - Universal Setup`);
|
|
1573
1635
|
console.log(`================================`);
|
|
1574
1636
|
console.log(`Installing to all detected coding agents...\n`);
|
|
1575
1637
|
}
|
|
1576
|
-
const result = await configManager.installToAll();
|
|
1638
|
+
const result = await runWithSpinner("Installing to detected coding agents", () => configManager.installToAll());
|
|
1577
1639
|
if (result.success.length === 0 && result.failed.length === 0) {
|
|
1578
1640
|
if (!silent) {
|
|
1579
|
-
console.log("\
|
|
1641
|
+
console.log("\nNo supported agents detected.");
|
|
1580
1642
|
console.log("Supported agents: Claude Code, Claude Desktop, Cursor, VS Code, Codex, Antigravity");
|
|
1581
1643
|
console.log("\nMake sure at least one is installed, then try again.");
|
|
1582
1644
|
}
|
|
1583
1645
|
return;
|
|
1584
1646
|
}
|
|
1585
1647
|
if (!silent) {
|
|
1586
|
-
console.log("
|
|
1648
|
+
console.log("Setup complete! Please RESTART your IDE(s) to apply changes.");
|
|
1587
1649
|
}
|
|
1588
1650
|
}
|
|
1589
1651
|
main().catch((error) => {
|
|
@@ -23,8 +23,8 @@ export class KaggleDownloader {
|
|
|
23
23
|
getCredentialError() {
|
|
24
24
|
if (!this.username && !this.key) {
|
|
25
25
|
return "Kaggle credentials missing. Please set KAGGLE_USERNAME and KAGGLE_KEY environment variables.\n" +
|
|
26
|
-
"
|
|
27
|
-
"
|
|
26
|
+
"Tip: Get your API token from https://www.kaggle.com/settings -> API -> Create New Token\n" +
|
|
27
|
+
"Alternative: Download the dataset manually and use analyze_quality() on local files.";
|
|
28
28
|
}
|
|
29
29
|
if (!this.username) {
|
|
30
30
|
return "KAGGLE_USERNAME is missing. Please set it in your MCP config or environment variables.";
|
|
@@ -64,15 +64,15 @@ export class JITOrchestrator {
|
|
|
64
64
|
async fetchAndIngest(query, limit = 10) {
|
|
65
65
|
// Rate limiting check
|
|
66
66
|
if (!this.canTrigger(query)) {
|
|
67
|
-
console.error(`[JIT]
|
|
67
|
+
console.error(`[JIT] Query "${query}" was searched recently. Waiting...`);
|
|
68
68
|
return [];
|
|
69
69
|
}
|
|
70
|
-
console.error(`\n[JIT]
|
|
70
|
+
console.error(`\n[JIT] Searching live sources for: "${query}"`);
|
|
71
71
|
this.lastTriggerTime.set(query, Date.now());
|
|
72
72
|
// Simplify query for better API results
|
|
73
73
|
const keywords = this.simplifyQuery(query);
|
|
74
74
|
if (keywords.length > 0) {
|
|
75
|
-
console.error(`[JIT]
|
|
75
|
+
console.error(`[JIT] Keywords extracted: ${keywords.join(", ")}`);
|
|
76
76
|
}
|
|
77
77
|
const newDatasets = [];
|
|
78
78
|
const existingIds = new Set();
|
|
@@ -94,7 +94,7 @@ export class JITOrchestrator {
|
|
|
94
94
|
}
|
|
95
95
|
}
|
|
96
96
|
sourceResults["HuggingFace"] = hfResults.length;
|
|
97
|
-
console.error(`
|
|
97
|
+
console.error(` [source] HuggingFace: ${hfResults.length} datasets`);
|
|
98
98
|
for (const ds of hfResults) {
|
|
99
99
|
if (!existingIds.has(ds.id)) {
|
|
100
100
|
newDatasets.push(ds);
|
|
@@ -104,7 +104,7 @@ export class JITOrchestrator {
|
|
|
104
104
|
// 2. Scrape UCI (Open Access)
|
|
105
105
|
const uciResults = await this.scrapeUCI(query, Math.floor(limit / 2));
|
|
106
106
|
sourceResults["UCI"] = uciResults.length;
|
|
107
|
-
console.error(`
|
|
107
|
+
console.error(` [source] UCI: ${uciResults.length} datasets`);
|
|
108
108
|
for (const ds of uciResults) {
|
|
109
109
|
if (!existingIds.has(ds.id)) {
|
|
110
110
|
newDatasets.push(ds);
|
|
@@ -114,7 +114,7 @@ export class JITOrchestrator {
|
|
|
114
114
|
// 3. Scrape GitHub (Open Access)
|
|
115
115
|
const githubResults = await this.scrapeGitHub(query, Math.floor(limit / 2));
|
|
116
116
|
sourceResults["GitHub"] = githubResults.length;
|
|
117
|
-
console.error(`
|
|
117
|
+
console.error(` [source] GitHub: ${githubResults.length} datasets`);
|
|
118
118
|
for (const ds of githubResults) {
|
|
119
119
|
if (!existingIds.has(ds.id)) {
|
|
120
120
|
newDatasets.push(ds);
|
|
@@ -124,7 +124,7 @@ export class JITOrchestrator {
|
|
|
124
124
|
// 4. Scrape World Bank (Open Access) - Economic/demographic data
|
|
125
125
|
const wbResults = await this.scrapeWorldBank(query, Math.floor(limit / 2));
|
|
126
126
|
sourceResults["WorldBank"] = wbResults.length;
|
|
127
|
-
console.error(`
|
|
127
|
+
console.error(` [source] World Bank: ${wbResults.length} datasets`);
|
|
128
128
|
for (const ds of wbResults) {
|
|
129
129
|
if (!existingIds.has(ds.id)) {
|
|
130
130
|
newDatasets.push(ds);
|
|
@@ -134,7 +134,7 @@ export class JITOrchestrator {
|
|
|
134
134
|
// 5. Scrape NASA (Open Access) - Scientific/space data
|
|
135
135
|
const nasaResults = await this.scrapeNASA(query, Math.floor(limit / 2));
|
|
136
136
|
sourceResults["NASA"] = nasaResults.length;
|
|
137
|
-
console.error(`
|
|
137
|
+
console.error(` [source] NASA: ${nasaResults.length} datasets`);
|
|
138
138
|
for (const ds of nasaResults) {
|
|
139
139
|
if (!existingIds.has(ds.id)) {
|
|
140
140
|
newDatasets.push(ds);
|
|
@@ -144,17 +144,17 @@ export class JITOrchestrator {
|
|
|
144
144
|
// Save and index new datasets
|
|
145
145
|
if (newDatasets.length > 0) {
|
|
146
146
|
await this.saveAndIndex(newDatasets);
|
|
147
|
-
console.error(`[JIT]
|
|
147
|
+
console.error(`[JIT] Indexed ${newDatasets.length} new datasets`);
|
|
148
148
|
}
|
|
149
149
|
else {
|
|
150
150
|
// Provide helpful feedback when no results found
|
|
151
151
|
const allZero = Object.values(sourceResults).every(v => v === 0);
|
|
152
152
|
if (allZero) {
|
|
153
|
-
console.error(`[JIT]
|
|
154
|
-
console.error(`[JIT]
|
|
153
|
+
console.error(`[JIT] No datasets found across all sources.`);
|
|
154
|
+
console.error(`[JIT] Try: broader keywords, or set HF_TOKEN for better HuggingFace access`);
|
|
155
155
|
}
|
|
156
156
|
else {
|
|
157
|
-
console.error(`[JIT]
|
|
157
|
+
console.error(`[JIT] All found datasets already in index`);
|
|
158
158
|
}
|
|
159
159
|
}
|
|
160
160
|
return newDatasets;
|
package/build/tools/formatter.js
CHANGED
|
@@ -2,29 +2,29 @@
|
|
|
2
2
|
* Format job status for visual representation
|
|
3
3
|
*/
|
|
4
4
|
export function formatJobStatus(job) {
|
|
5
|
-
const
|
|
6
|
-
"pending": "
|
|
7
|
-
"queued": "
|
|
8
|
-
"running": "
|
|
9
|
-
"completed": "
|
|
10
|
-
"failed": "
|
|
11
|
-
"retrying": "
|
|
5
|
+
const statusMap = {
|
|
6
|
+
"pending": "PENDING",
|
|
7
|
+
"queued": "QUEUED",
|
|
8
|
+
"running": "RUNNING",
|
|
9
|
+
"completed": "COMPLETED",
|
|
10
|
+
"failed": "FAILED",
|
|
11
|
+
"retrying": "RETRYING"
|
|
12
12
|
};
|
|
13
|
-
const
|
|
13
|
+
const statusText = statusMap[job.status] || "UNKNOWN";
|
|
14
14
|
const barWidth = 20;
|
|
15
15
|
const filledWidth = Math.round((job.progress / 100) * barWidth);
|
|
16
16
|
const emptyWidth = barWidth - filledWidth;
|
|
17
17
|
const bar = "█".repeat(filledWidth) + "░".repeat(emptyWidth);
|
|
18
18
|
let output = `═ Job Status: ${job.type.toUpperCase()} ═\n`;
|
|
19
19
|
output += `ID: ${job.id}\n`;
|
|
20
|
-
output += `Status: ${
|
|
20
|
+
output += `Status: ${statusText}\n`;
|
|
21
21
|
output += `Progress: ${bar} ${job.progress}%\n`;
|
|
22
22
|
output += `Activity: ${job.status_text}\n`;
|
|
23
23
|
if (job.result_url) {
|
|
24
|
-
output += `\
|
|
24
|
+
output += `\nResult: ${job.result_url}\n`;
|
|
25
25
|
}
|
|
26
26
|
if (job.error) {
|
|
27
|
-
output += `\
|
|
27
|
+
output += `\nERROR:\n`;
|
|
28
28
|
// Format multi-line errors nicely
|
|
29
29
|
const errorLines = job.error.split('\n');
|
|
30
30
|
errorLines.forEach(line => {
|
|
@@ -51,7 +51,7 @@ export function formatSearchResults(results) {
|
|
|
51
51
|
const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
|
|
52
52
|
const isOpen = openSources.includes(ds.source);
|
|
53
53
|
const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
|
|
54
|
-
const accessBadge = isOpen ? "
|
|
54
|
+
const accessBadge = isOpen ? "Open Access" : "Requires API Key";
|
|
55
55
|
// Safety indicator
|
|
56
56
|
let safetyIndicator = "";
|
|
57
57
|
if (ds.license.category === "safe") {
|
|
@@ -128,7 +128,7 @@ export function formatDatasetInfo(ds) {
|
|
|
128
128
|
const openSources = ["huggingface", "uci", "github", "worldbank", "nasa"];
|
|
129
129
|
const isOpen = openSources.includes(ds.source);
|
|
130
130
|
const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
|
|
131
|
-
const accessBadge = isOpen ? "
|
|
131
|
+
const accessBadge = isOpen ? "Open Access" : "Requires API Key";
|
|
132
132
|
let safetyIndicator = "";
|
|
133
133
|
if (ds.license.category === "safe") {
|
|
134
134
|
safetyIndicator = "Safe for use";
|
|
@@ -143,7 +143,7 @@ export function formatDatasetInfo(ds) {
|
|
|
143
143
|
output += `Safety: ${safetyIndicator}\n`;
|
|
144
144
|
output += `ID: ${ds.id}\n\n`;
|
|
145
145
|
if (!isOpen && ds.source === "kaggle") {
|
|
146
|
-
output +=
|
|
146
|
+
// Tell the user which credentials gate Kaggle-hosted datasets (variable name fixed: KAGGLE_USERNAME, not KAGGE_USERNAME).
output += `NOTE: This dataset requires a Kaggle API key (KAGGLE_USERNAME/KAGGLE_KEY) to prepare.\n\n`;
|
|
147
147
|
}
|
|
148
148
|
// Description
|
|
149
149
|
if (ds.description) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@vespermcp/mcp-server",
|
|
3
|
-
"version": "1.2.
|
|
3
|
+
"version": "1.2.1",
|
|
4
4
|
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "build/index.js",
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
"mcp-config-template.json"
|
|
18
18
|
],
|
|
19
19
|
"scripts": {
|
|
20
|
-
"build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});fs.readdirSync(src).forEach(f=>{if(f.endsWith('.py'))fs.copyFileSync(path.join(src,f),path.join(dest,f));});console.log('
|
|
20
|
+
"build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});fs.readdirSync(src).forEach(f=>{if(f.endsWith('.py'))fs.copyFileSync(path.join(src,f),path.join(dest,f));});console.log('Copied Python scripts to build/python');\"",
|
|
21
21
|
"dev": "tsx watch src/index.ts",
|
|
22
22
|
"postinstall": "node scripts/postinstall.cjs",
|
|
23
23
|
"scrape": "tsx src/scripts/scrape-metadata.ts",
|