@vespermcp/mcp-server 1.2.21 → 1.2.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/build/cache/service.js +7 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +6 -0
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +441 -0
- package/build/index.js +1815 -839
- package/build/ingestion/ingestor.js +7 -4
- package/build/install/install-service.js +11 -6
- package/build/lib/supabase.js +3 -0
- package/build/metadata/arxiv-source.js +229 -0
- package/build/metadata/circuit-breaker.js +62 -0
- package/build/metadata/github-source.js +203 -0
- package/build/metadata/hackernews-source.js +123 -0
- package/build/metadata/quality.js +27 -0
- package/build/metadata/scraper.js +85 -14
- package/build/metadata/semantic-scholar-source.js +138 -0
- package/build/python/asset_downloader_engine.py +2 -0
- package/build/python/convert_engine.py +92 -0
- package/build/python/export_engine.py +45 -0
- package/build/python/kaggle_engine.py +77 -5
- package/build/python/normalize_engine.py +83 -0
- package/build/python/vesper/core/asset_downloader.py +5 -1
- package/build/scripts/test-phase1-webcore-quality.js +104 -0
- package/build/search/engine.js +45 -6
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +509 -0
- package/build/tools/formatter.js +6 -3
- package/build/utils/python-runtime.js +130 -0
- package/build/web/extract-web.js +297 -0
- package/build/web/fusion-engine.js +457 -0
- package/build/web/types.js +1 -0
- package/build/web/web-core.js +242 -0
- package/package.json +12 -5
- package/scripts/postinstall.cjs +87 -31
- package/scripts/wizard.cjs +652 -0
- package/scripts/wizard.js +338 -12
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +2 -0
- package/src/python/convert_engine.py +92 -0
- package/src/python/export_engine.py +45 -0
- package/src/python/kaggle_engine.py +77 -5
- package/src/python/normalize_engine.py +83 -0
- package/src/python/requirements.txt +12 -0
- package/src/python/vesper/core/asset_downloader.py +5 -1
- package/wizard.cjs +3 -0
package/build/index.js
CHANGED
|
@@ -1,12 +1,39 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
// --- Dataset ID Normalization ---
|
|
3
3
|
/**
 * Convert a dataset reference into a flat key usable as a registry id or a
 * file-name fragment.
 *
 * A leading provider prefix (`kaggle:`, `huggingface:`, `hf:`, `openml:`,
 * `dataworld:`; case-insensitive) is removed from the id and re-attached as
 * `<provider>_`, with `hf` spelled out as `huggingface`. Every `/`, `\` and
 * `:` left in the id becomes `_`.
 *
 * @param {string} dataset_id - Raw dataset reference; surrounding whitespace is ignored.
 * @returns {string} The flattened key, e.g. `kaggle_user_data` for `kaggle:user/data`.
 */
function normalize_dataset_id(dataset_id) {
    const providerPrefix = /^(kaggle|huggingface|hf|openml|dataworld):/i;
    const raw = dataset_id.trim();
    const prefix = providerPrefix.exec(raw);
    // Strip the provider prefix first, then flatten separators for filesystem safety.
    const flattened = raw.replace(providerPrefix, "").replace(/[\\/:]/g, "_");
    if (prefix === null) {
        return flattened;
    }
    const provider = prefix[1].toLowerCase();
    const canonicalProvider = provider === "hf" ? "huggingface" : provider;
    return `${canonicalProvider}_${flattened}`;
}
|
|
15
|
+
/**
 * List the spellings under which a dataset reference may have been stored.
 *
 * - Prefixed input (`kaggle:`, `huggingface:`, `hf:`, `openml:`, `dataworld:`):
 *   the trimmed input, the id without its prefix, and, for `hf:` only, the
 *   `huggingface:` spelling.
 * - Unprefixed input: the trimmed input followed by that id under every
 *   known provider prefix.
 *
 * @param {string} dataset_id - Raw dataset reference.
 * @returns {string[]} Distinct aliases; the trimmed input is always first.
 */
function getDatasetIdAliases(dataset_id) {
    const providerPrefix = /^(kaggle|huggingface|hf|openml|dataworld):/i;
    const raw = dataset_id.trim();
    const prefix = providerPrefix.exec(raw);
    if (prefix === null) {
        // Provider unknown: offer the id under each provider we recognise.
        const guesses = ["kaggle", "huggingface", "hf", "openml", "dataworld"]
            .map(provider => `${provider}:${raw}`);
        return [...new Set([raw, ...guesses])];
    }
    const bareId = raw.replace(providerPrefix, "");
    const known = [raw, bareId];
    if (prefix[1].toLowerCase() === "hf") {
        known.push(`huggingface:${bareId}`);
    }
    // A Set keeps first-insertion order while removing duplicates.
    return [...new Set(known)];
}
|
|
35
|
+
/**
 * Produce the file-name fragment for a dataset reference.
 *
 * This is an alias for `normalize_dataset_id`, so on-disk names and registry
 * keys share a single normalisation.
 *
 * @param {string} dataset_id - Raw dataset reference.
 * @returns {string} The normalised fragment.
 */
function toSafeDatasetPathFragment(dataset_id) {
    const fragment = normalize_dataset_id(dataset_id);
    return fragment;
}
|
|
11
38
|
// --- Dataset Registry Helpers ---
|
|
12
39
|
function getRegistryPath() {
|
|
@@ -29,10 +56,11 @@ function writeRegistry(entries) {
|
|
|
29
56
|
fs.writeFileSync(registryPath, JSON.stringify(entries, null, 2));
|
|
30
57
|
}
|
|
31
58
|
function upsertRegistry(dataset_id, local_path, status) {
|
|
32
|
-
const
|
|
59
|
+
const aliases = getDatasetIdAliases(dataset_id).map(normalize_dataset_id);
|
|
60
|
+
const norm_id = aliases[0];
|
|
33
61
|
console.error(`[Registry] Writing key: ${norm_id}`);
|
|
34
62
|
const entries = readRegistry();
|
|
35
|
-
const idx = entries.findIndex(e => e.dataset_id
|
|
63
|
+
const idx = entries.findIndex(e => aliases.includes(e.dataset_id || e.id));
|
|
36
64
|
if (idx >= 0) {
|
|
37
65
|
entries[idx] = { dataset_id: norm_id, local_path, status };
|
|
38
66
|
}
|
|
@@ -42,9 +70,163 @@ function upsertRegistry(dataset_id, local_path, status) {
|
|
|
42
70
|
writeRegistry(entries);
|
|
43
71
|
}
|
|
44
72
|
/**
 * Find the registry entry for a dataset, trying every alias of the id.
 *
 * Each alias from `getDatasetIdAliases` is normalised, and the first registry
 * entry whose `dataset_id` (or, if that is falsy, `id`) equals one of those
 * keys is returned. The lookup keys are logged to stderr.
 *
 * @param {string} dataset_id - Raw dataset reference.
 * @returns {object|undefined} The matching entry, or `undefined` when none matches.
 */
function getRegistryEntry(dataset_id) {
    const lookupKeys = [];
    for (const alias of getDatasetIdAliases(dataset_id)) {
        lookupKeys.push(normalize_dataset_id(alias));
    }
    console.error(`[Registry] Lookup keys: ${lookupKeys.join(", ")}`);
    for (const entry of readRegistry()) {
        const entryKey = entry.dataset_id || entry.id;
        if (lookupKeys.includes(entryKey)) {
            return entry;
        }
    }
    return undefined;
}
|
|
77
|
+
// Extensions treated as directly exportable data files. The order is a
// priority list: ensureExportableLocalPath walks it front to back and returns
// the first extension that has a matching file.
const STRUCTURED_FILE_EXTENSIONS = [
    ".parquet",
    ".csv",
    ".jsonl",
    ".json",
    ".feather",
    ".arrow",
    ".tsv",
    ".txt",
];
// Extensions (lower-case, with leading dot) recognised as image files when
// building an image manifest.
const IMAGE_FILE_EXTENSIONS = new Set([
    ".jpg",
    ".jpeg",
    ".png",
    ".webp",
    ".bmp",
    ".gif",
    ".tiff",
    ".tif",
    ".svg",
]);
|
|
79
|
+
/**
 * Collect every regular file beneath a directory.
 *
 * Directory entries that are neither directories nor regular files are
 * ignored. The returned paths are built with `path.join` from `rootDir` and
 * sorted with the default string ordering.
 *
 * @param {string} rootDir - Directory to scan.
 * @returns {string[]} Sorted file paths.
 * @throws {Error} Whatever `fs.readdirSync` throws for an unreadable directory.
 */
function walkFilesRecursive(rootDir) {
    const collected = [];
    const visit = (dir) => {
        for (const dirent of fs.readdirSync(dir, { withFileTypes: true })) {
            const childPath = path.join(dir, dirent.name);
            if (dirent.isDirectory()) {
                visit(childPath);
            }
            else if (dirent.isFile()) {
                collected.push(childPath);
            }
        }
    };
    visit(rootDir);
    // Sorting makes the result independent of traversal order.
    return collected.sort();
}
|
|
98
|
+
/**
 * Build one manifest record for an image, inferring `split` and `label` from
 * the folders between `rootDir` and the file.
 *
 * - If the first folder is `train`, `test`, `val`, `valid` or `validation`
 *   (case-insensitive), it becomes `split` with its original casing; the
 *   deepest folder becomes `label` only when there is more than one folder.
 * - Otherwise the deepest folder becomes `label`.
 * - A file directly in `rootDir` gets neither key.
 *
 * @param {string} rootDir - Dataset root that relative paths are computed from.
 * @param {string} fullPath - Path of the image file.
 * @param {number} index - Value stored as the record's `id`.
 * @returns {object} Record with `id`, `image_path`, `relative_path`,
 *   `file_name`, `extension` and, when inferred, `split` and/or `label`.
 */
function inferImageManifestRecord(rootDir, fullPath, index) {
    const splitFolderNames = ["train", "test", "val", "valid", "validation"];
    // Use forward slashes in the relative path regardless of platform.
    const relativePath = path.relative(rootDir, fullPath).replace(/\\/g, "/");
    const folders = path.posix.dirname(relativePath)
        .split("/")
        .filter(segment => segment !== "" && segment !== ".");
    const record = {
        id: index,
        image_path: path.resolve(fullPath),
        relative_path: relativePath,
        file_name: path.basename(fullPath),
        extension: path.extname(fullPath).toLowerCase().replace(/^\./, ""),
    };
    if (folders.length === 0) {
        return record;
    }
    const topFolder = folders[0];
    const deepestFolder = folders[folders.length - 1];
    const topIsSplit = splitFolderNames.includes(topFolder.toLowerCase());
    if (topIsSplit) {
        record.split = topFolder;
    }
    // A lone split folder (e.g. "train/img.png") carries no label.
    if (!topIsSplit || folders.length > 1) {
        record.label = deepestFolder;
    }
    return record;
}
|
|
126
|
+
/**
 * Write a JSONL manifest describing every image under a directory.
 *
 * Files whose lower-cased extension is in `IMAGE_FILE_EXTENSIONS` each produce
 * one JSON line (see `inferImageManifestRecord`). The manifest is written to
 * `<rootDir>/_vesper_image_manifest.jsonl`, replacing any existing file.
 *
 * @param {string} rootDir - Directory to scan.
 * @returns {string} Path of the manifest that was written.
 * @throws {Error} When no image file exists under `rootDir`.
 */
function createImageManifestFromDirectory(rootDir) {
    const imagePaths = [];
    for (const candidate of walkFilesRecursive(rootDir)) {
        const ext = path.extname(candidate).toLowerCase();
        if (IMAGE_FILE_EXTENSIONS.has(ext)) {
            imagePaths.push(candidate);
        }
    }
    if (imagePaths.length === 0) {
        throw new Error(`No image files found under ${rootDir}`);
    }
    const manifestPath = path.join(rootDir, "_vesper_image_manifest.jsonl");
    const records = imagePaths.map((imagePath, position) => inferImageManifestRecord(rootDir, imagePath, position));
    const serialized = records.map(record => JSON.stringify(record)).join("\n");
    // JSONL convention used here: one record per line, newline-terminated.
    fs.writeFileSync(manifestPath, `${serialized}\n`, "utf-8");
    return manifestPath;
}
|
|
136
|
+
/**
 * Resolve a local path to a single file that can be exported.
 *
 * - A regular file is returned unchanged.
 * - A directory with an existing `_vesper_image_manifest.jsonl` yields that manifest.
 * - Otherwise the directory is scanned recursively and the first file whose
 *   extension appears earliest in `STRUCTURED_FILE_EXTENSIONS` is returned.
 * - With no structured file, an image manifest is generated and returned.
 *
 * @param {string} localPath - File or directory path.
 * @returns {string} Path of the file to export.
 * @throws {Error} When `localPath` does not exist, or when a directory holds
 *   neither structured files nor images.
 */
function ensureExportableLocalPath(localPath) {
    if (!fs.existsSync(localPath)) {
        throw new Error(`Local path not found: ${localPath}`);
    }
    if (fs.statSync(localPath).isFile()) {
        return localPath;
    }
    const existingManifest = path.join(localPath, "_vesper_image_manifest.jsonl");
    if (fs.existsSync(existingManifest)) {
        return existingManifest;
    }
    // Remember the first file seen for each extension (files arrive sorted).
    const firstFileByExtension = new Map();
    for (const filePath of walkFilesRecursive(localPath)) {
        const ext = path.extname(filePath).toLowerCase();
        if (!firstFileByExtension.has(ext)) {
            firstFileByExtension.set(ext, filePath);
        }
    }
    const preferredExtension = STRUCTURED_FILE_EXTENSIONS.find(ext => firstFileByExtension.has(ext));
    if (preferredExtension !== undefined) {
        return firstFileByExtension.get(preferredExtension);
    }
    return createImageManifestFromDirectory(localPath);
}
|
|
157
|
+
/**
 * Report whether `candidatePath` is `directoryPath` itself or lies beneath it.
 *
 * Both paths are resolved to absolute form first. The check is purely
 * lexical: nothing is read from disk and symlinks are not followed.
 *
 * @param {string} candidatePath - Path to test.
 * @param {string} directoryPath - Directory that should contain it.
 * @returns {boolean} `true` when the candidate is inside (or equal to) the directory.
 */
function isPathWithinDirectory(candidatePath, directoryPath) {
    const relativePath = path.relative(path.resolve(directoryPath), path.resolve(candidatePath));
    if (relativePath === "") {
        return true;
    }
    // path.relative returns an absolute path when no relative route exists
    // (for example, a different drive on Windows).
    if (path.isAbsolute(relativePath)) {
        return false;
    }
    // Only a leading ".." *segment* escapes the directory. A plain
    // startsWith("..") would also reject legitimate children such as "..cache/x".
    return relativePath !== ".." && !relativePath.startsWith(`..${path.sep}`);
}
|
|
161
|
+
/**
 * List the paths at which a dataset might be stored inside a directory.
 *
 * The result is `<baseDir>/<safeId>` with each of `.parquet`, `.csv`,
 * `.jsonl`, `.json`, `.feather`, `.arrow` appended, followed by the bare
 * `<baseDir>/<safeId>` entry.
 *
 * @param {string} baseDir - Directory to look in.
 * @param {string} safeId - Dataset path fragment.
 * @returns {string[]} Candidate paths, in lookup order.
 */
function buildDatasetCandidatePaths(baseDir, safeId) {
    const dataExtensions = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow"];
    const withExtension = dataExtensions.map(ext => path.join(baseDir, `${safeId}${ext}`));
    return [...withExtension, path.join(baseDir, safeId)];
}
|
|
172
|
+
/**
 * Report whether a path lies inside the module-level `dataRoot` directory.
 *
 * @param {string} localPath - Path to test.
 * @returns {boolean} The result of `isPathWithinDirectory(localPath, dataRoot)`.
 */
function shouldTrackExportPath(localPath) {
    const insideDataRoot = isPathWithinDirectory(localPath, dataRoot);
    return insideDataRoot;
}
|
|
175
|
+
/**
 * Report whether the argument names something that exists on disk, i.e. it is
 * a local path rather than a provider dataset id.
 *
 * @param {string} datasetIdOrPath - Dataset id or filesystem path.
 * @returns {boolean} `true` when `fs.existsSync` finds the path.
 */
function isDirectLocalDatasetReference(datasetIdOrPath) {
    const existsOnDisk = fs.existsSync(datasetIdOrPath);
    return existsOnDisk;
}
|
|
178
|
+
/**
 * Derive the base name (without extension) to use for exported files.
 *
 * - Existing directory: its base name.
 * - Existing file: its name without the extension.
 * - Anything else: the argument itself, treated as a dataset id.
 *
 * In every case the value is passed through `toSafeDatasetPathFragment`.
 *
 * @param {string} datasetIdOrPath - Dataset id or local path.
 * @returns {string} Safe file stem.
 */
function getExportFileStem(datasetIdOrPath) {
    if (!isDirectLocalDatasetReference(datasetIdOrPath)) {
        return toSafeDatasetPathFragment(datasetIdOrPath);
    }
    const absolutePath = path.resolve(datasetIdOrPath);
    let stem;
    if (fs.statSync(absolutePath).isDirectory()) {
        stem = path.basename(absolutePath);
    }
    else {
        stem = path.parse(absolutePath).name;
    }
    return toSafeDatasetPathFragment(stem);
}
|
|
189
|
+
/**
 * Make sure a source file is available inside `targetDir`, copying it there
 * when it lives elsewhere.
 *
 * A source whose parent directory already is `targetDir` is returned as-is
 * (resolved). Otherwise `targetDir` is created if missing and the source is
 * copied to `<targetDir>/<safe dataset id><source extension>`.
 *
 * @param {string} sourcePath - File to stage.
 * @param {string} datasetId - Dataset id used to name the staged copy.
 * @param {string} targetDir - Directory the pipeline reads from.
 * @returns {string} Absolute path of the file inside `targetDir`.
 */
function ensureLocalPipelineSource(sourcePath, datasetId, targetDir) {
    const stagingDir = path.resolve(targetDir);
    const absoluteSource = path.resolve(sourcePath);
    const alreadyStaged = path.dirname(absoluteSource) === stagingDir;
    if (alreadyStaged) {
        return absoluteSource;
    }
    if (!fs.existsSync(stagingDir)) {
        fs.mkdirSync(stagingDir, { recursive: true });
    }
    const stagedFileName = `${toSafeDatasetPathFragment(datasetId)}${path.extname(absoluteSource)}`;
    const stagedPath = path.join(stagingDir, stagedFileName);
    // Never copy a file onto itself.
    if (absoluteSource !== stagedPath) {
        fs.copyFileSync(absoluteSource, stagedPath);
    }
    return stagedPath;
}
|
|
204
|
+
/**
 * Locate the local, exportable file for a dataset id or path.
 *
 * Lookup order (first hit wins):
 *   1. `datasetIdOrPath` itself, when it exists on disk.
 *   2. Candidate file names inside each distinct, non-blank entry of `preferredDirs`.
 *   3. The `local_path` reported by `metadataStore.getDownloadStatus`.
 *   4. The registry entry's `local_path` (or `path`).
 *   5. Candidate file names inside `<dataRoot>/data/raw`.
 *
 * Every hit is passed through `ensureExportableLocalPath`, so a directory is
 * narrowed down to a single file.
 *
 * @param {string} datasetIdOrPath - Dataset id or local path.
 * @param {string[]} [preferredDirs=[]] - Directories to search before the
 *   stores; non-string and blank entries are ignored.
 * @returns {string|undefined} The exportable path, or `undefined` when nothing is found.
 */
function resolveDatasetLocalPath(datasetIdOrPath, preferredDirs = []) {
    if (fs.existsSync(datasetIdOrPath)) {
        return ensureExportableLocalPath(datasetIdOrPath);
    }
    const safeId = toSafeDatasetPathFragment(datasetIdOrPath);
    const firstExistingCandidate = (dir) => buildDatasetCandidatePaths(dir, safeId)
        .find(candidate => fs.existsSync(candidate));
    const usableDirs = preferredDirs.filter((dir) => typeof dir === "string" && dir.trim().length > 0);
    const visitedDirs = new Set();
    for (const dir of usableDirs) {
        const absoluteDir = path.resolve(dir);
        if (visitedDirs.has(absoluteDir)) {
            continue;
        }
        visitedDirs.add(absoluteDir);
        const preferredHit = firstExistingCandidate(absoluteDir);
        if (preferredHit) {
            return ensureExportableLocalPath(preferredHit);
        }
    }
    const downloadedPath = metadataStore.getDownloadStatus(datasetIdOrPath)?.local_path;
    if (downloadedPath && fs.existsSync(downloadedPath)) {
        return ensureExportableLocalPath(downloadedPath);
    }
    const registryEntry = getRegistryEntry(datasetIdOrPath);
    const registeredPath = registryEntry?.local_path || registryEntry?.path;
    if (registeredPath && fs.existsSync(registeredPath)) {
        return ensureExportableLocalPath(registeredPath);
    }
    const rawHit = firstExistingCandidate(path.join(dataRoot, "data", "raw"));
    if (!rawHit) {
        return undefined;
    }
    return ensureExportableLocalPath(rawHit);
}
|
|
49
231
|
// --- Pipeline State Tracker ---
|
|
50
232
|
// Tracks completed steps per session/job/dataset
|
|
@@ -66,7 +248,7 @@ export function hasStep(datasetId, step) {
|
|
|
66
248
|
// --- Dataset ID Auto-Detection ---
|
|
67
249
|
export function parseDatasetId(id) {
|
|
68
250
|
const trimmed = id.trim();
|
|
69
|
-
if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|http|https):/i.test(trimmed))
|
|
251
|
+
if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|arxiv:|github:|http|https):/i.test(trimmed))
|
|
70
252
|
return trimmed;
|
|
71
253
|
if (trimmed.includes("/") && !trimmed.includes(":"))
|
|
72
254
|
return `kaggle:${trimmed}`;
|
|
@@ -88,6 +270,14 @@ import { HuggingFaceScraper } from "./metadata/scraper.js";
|
|
|
88
270
|
import { KaggleSource } from "./metadata/kaggle-source.js";
|
|
89
271
|
import { OpenMLSource } from "./metadata/openml-source.js";
|
|
90
272
|
import { DataWorldSource } from "./metadata/dataworld-source.js";
|
|
273
|
+
import { ArxivSource } from "./metadata/arxiv-source.js";
|
|
274
|
+
import { GithubSource } from "./metadata/github-source.js";
|
|
275
|
+
import { UnifiedDatasetGateway } from "./gateway/unified-dataset-gateway.js";
|
|
276
|
+
import { WebCoreEngine } from "./web/web-core.js";
|
|
277
|
+
import { WebFusionEngine } from "./web/fusion-engine.js";
|
|
278
|
+
import { WebExtractorEngine } from "./web/extract-web.js";
|
|
279
|
+
import { SemanticScholarSource } from "./metadata/semantic-scholar-source.js";
|
|
280
|
+
import { HackerNewsSource } from "./metadata/hackernews-source.js";
|
|
91
281
|
import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
|
|
92
282
|
import { JobManager } from "./jobs/manager.js";
|
|
93
283
|
import { QualityAnalyzer } from "./quality/analyzer.js";
|
|
@@ -131,6 +321,34 @@ function logError(err, context) {
|
|
|
131
321
|
fs.appendFileSync(errorLogPath, msg);
|
|
132
322
|
console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
|
|
133
323
|
}
|
|
324
|
+
// --- Request Queue: serialize all MCP tool calls to prevent crashes ---
|
|
325
|
+
/**
 * FIFO queue that runs submitted tasks one at a time.
 *
 * `enqueue` returns a promise that settles with the task's own outcome. A
 * task that throws or rejects only rejects its own promise; the queue moves
 * on to the next task.
 */
class RequestQueue {
    // Pending entries of the form { resolve, reject, task }, oldest first.
    queue = [];
    // True while a drain loop is active, so only one loop runs at a time.
    running = false;
    /**
     * Add a task to the back of the queue.
     *
     * @param {Function} task - Called with no arguments; may return a value or a promise.
     * @returns {Promise<*>} Settles with the task's result or error.
     */
    enqueue(task) {
        const outcome = new Promise((resolve, reject) => {
            this.queue.push({ resolve, reject, task });
        });
        this.drain();
        return outcome;
    }
    /**
     * Run queued tasks in order until the queue is empty. Returns at once
     * when another drain loop is already running.
     *
     * @returns {Promise<void>}
     */
    async drain() {
        if (this.running) {
            return;
        }
        this.running = true;
        while (this.queue.length !== 0) {
            const entry = this.queue.shift();
            try {
                entry.resolve(await entry.task());
            }
            catch (taskError) {
                entry.reject(taskError);
            }
        }
        this.running = false;
    }
}
// Shared queue instance for the module.
const requestQueue = new RequestQueue();
|
|
134
352
|
const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
|
|
135
353
|
function printLaunchScreen() {
|
|
136
354
|
const screen = `
|
|
@@ -198,6 +416,21 @@ function extractRequestedRows(query, requirements) {
|
|
|
198
416
|
if (Number.isFinite(n) && n > 0)
|
|
199
417
|
return n;
|
|
200
418
|
}
|
|
419
|
+
const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
|
|
420
|
+
.map(m => Number(m[0].replace(/,/g, "")))
|
|
421
|
+
.filter(n => Number.isFinite(n) && n > 0);
|
|
422
|
+
if (commaNumbers.length > 0)
|
|
423
|
+
return Math.max(...commaNumbers);
|
|
424
|
+
const humanSized = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
|
|
425
|
+
.map(m => {
|
|
426
|
+
const base = Number(m[1]);
|
|
427
|
+
const suffix = m[2].toLowerCase();
|
|
428
|
+
const multiplier = suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000;
|
|
429
|
+
return Math.round(base * multiplier);
|
|
430
|
+
})
|
|
431
|
+
.filter(n => Number.isFinite(n) && n > 0);
|
|
432
|
+
if (humanSized.length > 0)
|
|
433
|
+
return Math.max(...humanSized);
|
|
201
434
|
const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
|
|
202
435
|
.map(m => Number(m[0]))
|
|
203
436
|
.filter(n => Number.isFinite(n) && n > 0);
|
|
@@ -367,7 +600,45 @@ function syncPythonScripts(appRoot, dataRoot) {
|
|
|
367
600
|
}
|
|
368
601
|
// Sync scripts immediately
|
|
369
602
|
syncPythonScripts(appRoot, dataRoot);
|
|
370
|
-
|
|
603
|
+
// Auto-rebuild better-sqlite3 if native binary doesn't match current Node version
|
|
604
|
+
/**
 * Try to rebuild the `better-sqlite3` native binding for the running Node
 * version by executing `npm rebuild better-sqlite3` in the package root (the
 * parent of `__dirname`), with a 60-second limit.
 *
 * On success, every `require.cache` key containing `better-sqlite3` or
 * `better_sqlite3` is removed so the next load picks up the rebuilt binary.
 *
 * NOTE(review): this file uses ES module syntax elsewhere. Confirm `require`
 * is defined in module scope (e.g. via `createRequire`); if it is not, the
 * resulting ReferenceError is caught below and the function returns false.
 *
 * @returns {boolean} `true` when the rebuild command succeeded, `false` on any failure.
 */
function tryRebuildSqlite() {
    try {
        const childProcess = require("child_process");
        const packageRoot = path.resolve(__dirname, "..");
        console.error(`[Vesper] Rebuilding better-sqlite3 for Node ${process.version}...`);
        childProcess.execSync("npm rebuild better-sqlite3", {
            stdio: "pipe",
            timeout: 60000,
            cwd: packageRoot,
        });
        console.error("[Vesper] Rebuild succeeded. Retrying...");
        // Clear require cache so the rebuilt module is loaded
        const staleKeys = Object.keys(require.cache)
            .filter(cacheKey => cacheKey.includes("better-sqlite3") || cacheKey.includes("better_sqlite3"));
        for (const cacheKey of staleKeys) {
            delete require.cache[cacheKey];
        }
        return true;
    }
    catch (rebuildError) {
        console.error(`[Vesper] Auto-rebuild failed: ${rebuildError?.message || rebuildError}`);
        return false;
    }
}
|
|
628
|
+
// Open the metadata store. When the first attempt fails with
// ERR_DLOPEN_FAILED (the native binding could not be loaded), try one
// automatic rebuild and open the store again. Any other failure, or a failed
// rebuild, logs a hint and rethrows the original error.
// NOTE(review): the "Cannot load better-sqlite3" hint is printed for every
// error that reaches the fallback branch, not only for binding load
// failures. Confirm that is intended.
let metadataStore;
try {
    metadataStore = new MetadataStore(dbPath);
}
catch (loadError) {
    const rebuilt = loadError?.code === "ERR_DLOPEN_FAILED" && tryRebuildSqlite();
    if (!rebuilt) {
        console.error("[Vesper] FATAL: Cannot load better-sqlite3.");
        console.error("[Vesper] Run: npm rebuild better-sqlite3");
        throw loadError;
    }
    metadataStore = new MetadataStore(dbPath);
}
|
|
371
642
|
const vectorStore = new VectorStore(vectorPath);
|
|
372
643
|
const embedder = Embedder.getInstance();
|
|
373
644
|
const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
|
|
@@ -382,7 +653,16 @@ const dataSplitter = new DataSplitter(__dirname);
|
|
|
382
653
|
const dataExporter = new DataExporter(__dirname);
|
|
383
654
|
const fusionEngine = new DataFusionEngine(__dirname);
|
|
384
655
|
const kaggleSource = new KaggleSource(__dirname);
|
|
656
|
+
const openmlSource = new OpenMLSource(__dirname);
|
|
657
|
+
const dataworldSource = new DataWorldSource(__dirname);
|
|
658
|
+
const arxivSource = new ArxivSource(cacheService);
|
|
659
|
+
const githubSource = new GithubSource(cacheService);
|
|
385
660
|
const secureKeys = new SecureKeysManager(__dirname);
|
|
661
|
+
const semanticScholarSource = new SemanticScholarSource(cacheService);
|
|
662
|
+
const hackerNewsSource = new HackerNewsSource(cacheService);
|
|
663
|
+
const webCoreEngine = new WebCoreEngine({ arxivSource, githubSource, semanticScholarSource, hackerNewsSource });
|
|
664
|
+
const webFusionEngine = new WebFusionEngine({ webCoreEngine, embedder, cache: cacheService });
|
|
665
|
+
const webExtractorEngine = new WebExtractorEngine(cacheService);
|
|
386
666
|
function hydrateExternalKeys() {
|
|
387
667
|
const keys = secureKeys.getAll();
|
|
388
668
|
if (!process.env.HF_TOKEN && !process.env.HUGGINGFACE_TOKEN && keys.hf_token) {
|
|
@@ -401,6 +681,17 @@ function hydrateExternalKeys() {
|
|
|
401
681
|
/**
 * Report whether a data.world token is configured.
 *
 * The `DW_AUTH_TOKEN` environment variable is checked first; the stored keys
 * (`secureKeys.getAll().dataworld_token`) are read only when it is unset or empty.
 *
 * @returns {boolean} `true` when either source provides a non-empty token.
 */
function hasDataWorldToken() {
    if (process.env.DW_AUTH_TOKEN) {
        return true;
    }
    return Boolean(secureKeys.getAll().dataworld_token);
}
|
|
684
|
+
const unifiedDatasetGateway = new UnifiedDatasetGateway({
|
|
685
|
+
metadataStore,
|
|
686
|
+
dataIngestor,
|
|
687
|
+
dataRoot,
|
|
688
|
+
kaggleSource,
|
|
689
|
+
openmlSource,
|
|
690
|
+
dataworldSource,
|
|
691
|
+
arxivSource,
|
|
692
|
+
githubSource,
|
|
693
|
+
hasDataWorldToken,
|
|
694
|
+
});
|
|
404
695
|
// CRITICAL FIX: Pass __dirname (build directory) to analyzers
|
|
405
696
|
// Python scripts are in build/python/, so analyzers should look relative to build/
|
|
406
697
|
// NOT relative to project root (appRoot)
|
|
@@ -432,7 +723,7 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
432
723
|
console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
|
|
433
724
|
const metadata = job.metadata ? JSON.parse(job.metadata) : {};
|
|
434
725
|
switch (job.type) {
|
|
435
|
-
case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
|
|
726
|
+
case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements, metadata.outputDir);
|
|
436
727
|
case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
|
|
437
728
|
default: throw new Error(`Unhandled job type: ${job.type}`);
|
|
438
729
|
}
|
|
@@ -450,9 +741,21 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
450
741
|
/**
|
|
451
742
|
* Logic for preparing a dataset (Search + Ingest + Process)
|
|
452
743
|
*/
|
|
453
|
-
async function handlePrepareJob(jobId, query, requirements) {
|
|
744
|
+
async function handlePrepareJob(jobId, query, requirements, outputDir) {
|
|
454
745
|
hydrateExternalKeys();
|
|
455
746
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
747
|
+
const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
|
|
748
|
+
const stepStatus = {};
|
|
749
|
+
for (const s of pipelineSteps)
|
|
750
|
+
stepStatus[s] = "pending";
|
|
751
|
+
const markPipelineStep = (step, status) => {
|
|
752
|
+
stepStatus[step] = status;
|
|
753
|
+
const summary = pipelineSteps.map(s => {
|
|
754
|
+
const st = stepStatus[s];
|
|
755
|
+
return st === "done" ? `[${s}]` : st === "running" ? `>${s}<` : st === "failed" ? `!${s}!` : st === "skipped" ? `~${s}~` : ` ${s} `;
|
|
756
|
+
}).join(" → ");
|
|
757
|
+
console.error(`[Pipeline] ${summary}`);
|
|
758
|
+
};
|
|
456
759
|
// Ensure core Python packages are available for dataset operations
|
|
457
760
|
try {
|
|
458
761
|
await ensurePythonModules([
|
|
@@ -465,11 +768,12 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
465
768
|
// Continue anyway - direct file downloads may still work without datasets lib
|
|
466
769
|
}
|
|
467
770
|
const requestedRows = extractRequestedRows(query, requirements);
|
|
771
|
+
const searchQuery = requirements ? `${query} ${requirements}` : query;
|
|
468
772
|
let selectedDataset;
|
|
469
773
|
let datasetIdForDownload = "";
|
|
470
774
|
let source;
|
|
471
775
|
const parsedQuery = parseDatasetId(query);
|
|
472
|
-
const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
|
|
776
|
+
const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:|arxiv:|github:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
|
|
473
777
|
if (isExplicitDatasetRef) {
|
|
474
778
|
let explicitId = parsedQuery;
|
|
475
779
|
if (/^hf:/i.test(explicitId)) {
|
|
@@ -491,6 +795,12 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
491
795
|
source = "dataworld";
|
|
492
796
|
datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
|
|
493
797
|
}
|
|
798
|
+
else if (/^arxiv:/i.test(explicitId)) {
|
|
799
|
+
throw new Error("prepare_dataset does not support direct arXiv downloads yet. Use unified_dataset_api with operation='discover' or 'info' for arXiv.");
|
|
800
|
+
}
|
|
801
|
+
else if (/^github:/i.test(explicitId)) {
|
|
802
|
+
throw new Error("prepare_dataset does not support direct GitHub downloads yet. Use unified_dataset_api with operation='discover' or 'info' for GitHub.");
|
|
803
|
+
}
|
|
494
804
|
else {
|
|
495
805
|
// Default to HuggingFace for ambiguous refs (user/dataset without prefix)
|
|
496
806
|
source = "huggingface";
|
|
@@ -500,11 +810,14 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
500
810
|
progress: 20,
|
|
501
811
|
status_text: `Using explicit dataset id: ${datasetIdForDownload} (${source})`
|
|
502
812
|
});
|
|
813
|
+
markPipelineStep("search", "skipped");
|
|
503
814
|
}
|
|
504
815
|
else {
|
|
816
|
+
markPipelineStep("search", "running");
|
|
505
817
|
update({ progress: 10, status_text: "Searching for best dataset matching query..." });
|
|
506
|
-
const results = await searchEngine.search(
|
|
818
|
+
const results = await searchEngine.search(searchQuery, { limit: 10 });
|
|
507
819
|
if (results.length === 0) {
|
|
820
|
+
markPipelineStep("search", "failed");
|
|
508
821
|
throw new Error("No datasets found matching the query. Try refining your search terms.");
|
|
509
822
|
}
|
|
510
823
|
// Pick the best result that we can actually download (skip sources requiring missing credentials)
|
|
@@ -512,20 +825,32 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
512
825
|
const hasDwToken = hasDataWorldToken();
|
|
513
826
|
selectedDataset = results.find(r => {
|
|
514
827
|
const s = (r.source || "").toLowerCase();
|
|
828
|
+
if (s === "arxiv")
|
|
829
|
+
return false; // Phase 1: discover/info only, no direct download yet
|
|
830
|
+
if (s === "github")
|
|
831
|
+
return false; // Phase 1: discover/info only, no direct download yet
|
|
515
832
|
if (s === "kaggle" && !hasKaggleCreds)
|
|
516
833
|
return false;
|
|
517
834
|
if (s === "dataworld" && !hasDwToken)
|
|
518
835
|
return false;
|
|
519
836
|
return true;
|
|
520
837
|
}) || results[0]; // Fallback to first if all require credentials
|
|
838
|
+
if ((selectedDataset.source || "").toLowerCase() === "arxiv") {
|
|
839
|
+
throw new Error("Matched an arXiv paper, but prepare_dataset currently supports downloadable dataset providers only.");
|
|
840
|
+
}
|
|
841
|
+
if ((selectedDataset.source || "").toLowerCase() === "github") {
|
|
842
|
+
throw new Error("Matched a GitHub repo, but prepare_dataset currently supports downloadable dataset providers only.");
|
|
843
|
+
}
|
|
521
844
|
datasetIdForDownload = selectedDataset.id;
|
|
522
845
|
source = selectedDataset.source;
|
|
523
846
|
update({
|
|
524
847
|
progress: 20,
|
|
525
848
|
status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
|
|
526
849
|
});
|
|
850
|
+
markPipelineStep("search", "done");
|
|
527
851
|
}
|
|
528
852
|
// Pre-check credentials for sources that require them
|
|
853
|
+
markPipelineStep("validate", "running");
|
|
529
854
|
if (source === "kaggle") {
|
|
530
855
|
const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
|
|
531
856
|
if (!hasKaggleCreds) {
|
|
@@ -533,8 +858,11 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
533
858
|
}
|
|
534
859
|
}
|
|
535
860
|
if (source === "dataworld" && !hasDataWorldToken()) {
|
|
861
|
+
markPipelineStep("validate", "failed");
|
|
536
862
|
throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
|
|
537
863
|
}
|
|
864
|
+
markPipelineStep("validate", "done");
|
|
865
|
+
markPipelineStep("download", "running");
|
|
538
866
|
update({ progress: 30, status_text: `Starting download from ${source}...` });
|
|
539
867
|
// ensureData handles download and returns path to the raw file
|
|
540
868
|
let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
|
|
@@ -545,7 +873,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
545
873
|
let currentRows = await countRows(rawFilePath);
|
|
546
874
|
if (currentRows < requestedRows) {
|
|
547
875
|
update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
|
|
548
|
-
const additional = await searchEngine.search(
|
|
876
|
+
const additional = await searchEngine.search(searchQuery, { limit: 8 });
|
|
549
877
|
const sourceFiles = [rawFilePath];
|
|
550
878
|
let totalRows = currentRows;
|
|
551
879
|
for (const ds of additional) {
|
|
@@ -597,15 +925,50 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
597
925
|
update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
|
|
598
926
|
}
|
|
599
927
|
}
|
|
928
|
+
markPipelineStep("download", "done");
|
|
929
|
+
// ── Normalize step: convert any raw format → parquet ──
|
|
930
|
+
markPipelineStep("normalize", "running");
|
|
931
|
+
const rawExt = path.extname(rawFilePath).toLowerCase();
|
|
932
|
+
if (rawExt !== ".parquet" && rawExt !== ".pq") {
|
|
933
|
+
update({ progress: 70, status_text: "Normalizing to parquet..." });
|
|
934
|
+
const normalizedDir = path.join(dataRoot, "data", "normalized");
|
|
935
|
+
if (!fs.existsSync(normalizedDir))
|
|
936
|
+
fs.mkdirSync(normalizedDir, { recursive: true });
|
|
937
|
+
const safeId = toSafeDatasetPathFragment(datasetIdForDownload);
|
|
938
|
+
const normalizedPath = path.join(normalizedDir, `${safeId}.parquet`);
|
|
939
|
+
try {
|
|
940
|
+
const normScript = path.join(dataRoot, "python", "normalize_engine.py");
|
|
941
|
+
const normResult = await runPythonJson(normScript, [rawFilePath, normalizedPath]);
|
|
942
|
+
if (normResult.ok && fs.existsSync(normalizedPath)) {
|
|
943
|
+
console.error(`[Prepare] Normalized ${rawExt} → parquet (${normResult.rows} rows)`);
|
|
944
|
+
rawFilePath = normalizedPath;
|
|
945
|
+
markPipelineStep("normalize", "done");
|
|
946
|
+
}
|
|
947
|
+
else {
|
|
948
|
+
console.error(`[Prepare] Normalize failed: ${normResult.error}, continuing with raw file`);
|
|
949
|
+
markPipelineStep("normalize", "skipped");
|
|
950
|
+
}
|
|
951
|
+
}
|
|
952
|
+
catch (e) {
|
|
953
|
+
console.error(`[Prepare] Normalize step failed: ${e?.message || e}, continuing with raw file`);
|
|
954
|
+
markPipelineStep("normalize", "skipped");
|
|
955
|
+
}
|
|
956
|
+
}
|
|
957
|
+
else {
|
|
958
|
+
markPipelineStep("normalize", "done");
|
|
959
|
+
}
|
|
600
960
|
let qualityScore = selectedDataset?.quality_score ?? 70;
|
|
601
|
-
|
|
961
|
+
markPipelineStep("quality", "running");
|
|
962
|
+
update({ progress: 75, status_text: "Analyzing dataset quality..." });
|
|
602
963
|
try {
|
|
603
964
|
const report = await qualityAnalyzer.analyze(rawFilePath);
|
|
604
965
|
qualityScore = report.overall_score;
|
|
966
|
+
markPipelineStep("quality", "done");
|
|
605
967
|
}
|
|
606
968
|
catch (error) {
|
|
607
969
|
console.error(`[Prepare] Quality analysis failed for ${datasetIdForDownload}: ${error?.message || error}`);
|
|
608
970
|
update({ progress: 78, status_text: "Quality analysis skipped (unsupported schema). Continuing installation..." });
|
|
971
|
+
markPipelineStep("quality", "skipped");
|
|
609
972
|
}
|
|
610
973
|
if (selectedDataset) {
|
|
611
974
|
metadataStore.saveDataset({
|
|
@@ -613,15 +976,62 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
613
976
|
quality_score: qualityScore
|
|
614
977
|
});
|
|
615
978
|
}
|
|
979
|
+
else {
|
|
980
|
+
// Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
|
|
981
|
+
try {
|
|
982
|
+
const existingMeta = metadataStore.getDataset(datasetIdForDownload);
|
|
983
|
+
if (!existingMeta) {
|
|
984
|
+
metadataStore.saveDataset({
|
|
985
|
+
id: datasetIdForDownload,
|
|
986
|
+
source: source,
|
|
987
|
+
name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
|
|
988
|
+
description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
|
|
989
|
+
quality_warnings: [],
|
|
990
|
+
downloads: 0,
|
|
991
|
+
likes: 0,
|
|
992
|
+
stars: 0,
|
|
993
|
+
tags: [],
|
|
994
|
+
last_updated: new Date().toISOString(),
|
|
995
|
+
task: "unknown",
|
|
996
|
+
domain: "unknown",
|
|
997
|
+
languages: [],
|
|
998
|
+
splits: [],
|
|
999
|
+
license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
|
|
1000
|
+
quality_score: qualityScore,
|
|
1001
|
+
download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
|
|
1002
|
+
total_examples: 0,
|
|
1003
|
+
is_structured: false,
|
|
1004
|
+
has_target_column: false,
|
|
1005
|
+
is_safe_source: true,
|
|
1006
|
+
has_personal_data: false,
|
|
1007
|
+
is_paywalled: false,
|
|
1008
|
+
is_scraped_web_data: false,
|
|
1009
|
+
uses_https: true,
|
|
1010
|
+
has_train_split: false,
|
|
1011
|
+
has_test_split: false,
|
|
1012
|
+
has_validation_split: false,
|
|
1013
|
+
description_length: 0,
|
|
1014
|
+
has_readme: false,
|
|
1015
|
+
});
|
|
1016
|
+
}
|
|
1017
|
+
}
|
|
1018
|
+
catch (e) {
|
|
1019
|
+
console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
|
|
1020
|
+
}
|
|
1021
|
+
}
|
|
1022
|
+
markPipelineStep("register", "running");
|
|
616
1023
|
update({ progress: 85, status_text: "Installing dataset into project..." });
|
|
617
|
-
const installPath = await installService.install(datasetIdForDownload, rawFilePath);
|
|
1024
|
+
const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
|
|
618
1025
|
update({ progress: 100, status_text: "Preparation complete!" });
|
|
619
1026
|
// Register prepared dataset in local registry for lookup by export/list tools
|
|
620
1027
|
try {
|
|
621
1028
|
upsertRegistry(datasetIdForDownload, installPath, "completed");
|
|
1029
|
+
markPipelineStep("register", "done");
|
|
1030
|
+
markStepComplete(datasetIdForDownload, "prepare");
|
|
622
1031
|
}
|
|
623
1032
|
catch (e) {
|
|
624
1033
|
console.error(`[Registry] Failed to write registry for ${datasetIdForDownload}: ${e?.message || e}`);
|
|
1034
|
+
markPipelineStep("register", "failed");
|
|
625
1035
|
}
|
|
626
1036
|
return installPath;
|
|
627
1037
|
}
|
|
@@ -647,7 +1057,7 @@ async function handleCleanJob(jobId, datasetId, ops) {
|
|
|
647
1057
|
}
|
|
648
1058
|
// 3. Check standard raw data paths
|
|
649
1059
|
if (!filePath) {
|
|
650
|
-
const safeId = datasetId
|
|
1060
|
+
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
651
1061
|
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
652
1062
|
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
653
1063
|
const featherPath = path.join(dataRoot, "data", "raw", `${safeId}.feather`);
|
|
@@ -712,9 +1122,146 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
712
1122
|
required: ["query"],
|
|
713
1123
|
},
|
|
714
1124
|
},
|
|
1125
|
+
{
|
|
1126
|
+
name: "unified_dataset_api",
|
|
1127
|
+
description: "Single facade over multiple external dataset providers. Supports provider discovery, dataset search, dataset download, and dataset info through one MCP tool using public access and server-managed credentials when available.",
|
|
1128
|
+
inputSchema: {
|
|
1129
|
+
type: "object",
|
|
1130
|
+
properties: {
|
|
1131
|
+
operation: {
|
|
1132
|
+
type: "string",
|
|
1133
|
+
enum: ["providers", "discover", "download", "info"],
|
|
1134
|
+
description: "Gateway operation to execute.",
|
|
1135
|
+
},
|
|
1136
|
+
source: {
|
|
1137
|
+
type: "string",
|
|
1138
|
+
enum: ["auto", "huggingface", "kaggle", "openml", "dataworld", "arxiv", "github", "s3", "bigquery"],
|
|
1139
|
+
description: "Optional provider selector. Use 'auto' to let Vesper choose a compatible backend.",
|
|
1140
|
+
},
|
|
1141
|
+
query: {
|
|
1142
|
+
type: "string",
|
|
1143
|
+
description: "Dataset discovery query. Required for operation='discover'.",
|
|
1144
|
+
},
|
|
1145
|
+
dataset_id: {
|
|
1146
|
+
type: "string",
|
|
1147
|
+
description: "Dataset identifier or object reference. Required for operation='download' and operation='info'. Supports prefixed ids like 'huggingface:user/dataset' and public S3 URIs like 's3://bucket/key'.",
|
|
1148
|
+
},
|
|
1149
|
+
limit: {
|
|
1150
|
+
type: "number",
|
|
1151
|
+
description: "Max results for operation='discover' (default: 10).",
|
|
1152
|
+
},
|
|
1153
|
+
target_dir: {
|
|
1154
|
+
type: "string",
|
|
1155
|
+
description: "Optional output directory for operation='download'. Defaults to the current working directory when omitted.",
|
|
1156
|
+
},
|
|
1157
|
+
output_dir: {
|
|
1158
|
+
type: "string",
|
|
1159
|
+
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
1160
|
+
},
|
|
1161
|
+
public_only: {
|
|
1162
|
+
type: "boolean",
|
|
1163
|
+
description: "When true, discover/info stay on public providers only unless a specific source is requested.",
|
|
1164
|
+
},
|
|
1165
|
+
include_unavailable: {
|
|
1166
|
+
type: "boolean",
|
|
1167
|
+
description: "When true, operation='providers' includes connectors that are scaffolded but not currently configured.",
|
|
1168
|
+
},
|
|
1169
|
+
},
|
|
1170
|
+
required: ["operation"],
|
|
1171
|
+
},
|
|
1172
|
+
},
|
|
1173
|
+
{
|
|
1174
|
+
name: "vesper_web_find",
|
|
1175
|
+
description: "Phase 1 Web Core: search web-native sources (ArXiv, GitHub) and return structured, validated documents using a unified schema (source_type, source_url, content, metadata_json, quality_score, collected_at, content_type).",
|
|
1176
|
+
inputSchema: {
|
|
1177
|
+
type: "object",
|
|
1178
|
+
properties: {
|
|
1179
|
+
query: { type: "string", description: "Natural language query, e.g. 'agentic RAG evaluation'" },
|
|
1180
|
+
sources: {
|
|
1181
|
+
type: "array",
|
|
1182
|
+
items: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews"] },
|
|
1183
|
+
description: "Optional subset of sources. Defaults to ['arxiv','github'] when omitted.",
|
|
1184
|
+
},
|
|
1185
|
+
limit: { type: "number", description: "Max documents to return (default 10, max 50)." },
|
|
1186
|
+
arxiv_full_text: { type: "boolean", description: "When true, fetch and parse ArXiv PDFs and return full text as document content (slower)." },
|
|
1187
|
+
github_include_readme: { type: "boolean", description: "When true, fetch and include GitHub README.md text as document content (slower)." },
|
|
1188
|
+
},
|
|
1189
|
+
required: ["query"],
|
|
1190
|
+
},
|
|
1191
|
+
},
|
|
1192
|
+
{
|
|
1193
|
+
name: "vesper.fuse",
|
|
1194
|
+
description: "Phase 2 Data Fusion: fuse results from multiple web-native sources into one unified, deduplicated corpus (provenance via source_chain).",
|
|
1195
|
+
inputSchema: {
|
|
1196
|
+
type: "object",
|
|
1197
|
+
properties: {
|
|
1198
|
+
sources: {
|
|
1199
|
+
type: "array",
|
|
1200
|
+
description: "Web sources to collect from, each with its own query.",
|
|
1201
|
+
items: {
|
|
1202
|
+
type: "object",
|
|
1203
|
+
properties: {
|
|
1204
|
+
type: { type: "string", enum: ["arxiv", "github", "semantic_scholar", "hackernews", "s3"] },
|
|
1205
|
+
query: { type: "string", description: "Query for this source." },
|
|
1206
|
+
max_results: { type: "number", description: "Max results for this source (optional)." },
|
|
1207
|
+
min_stars: { type: "number", description: "Optional popularity filter (GitHub) based on stars/proxy fields." },
|
|
1208
|
+
bucket: { type: "string", description: "S3 bucket (for type='s3')." },
|
|
1209
|
+
path: { type: "string", description: "S3 prefix/path (for type='s3')." },
|
|
1210
|
+
region: { type: "string", description: "AWS region (for type='s3')." },
|
|
1211
|
+
credentials: {
|
|
1212
|
+
type: "object",
|
|
1213
|
+
description: "Pass-through AWS credentials (optional; not persisted).",
|
|
1214
|
+
properties: {
|
|
1215
|
+
accessKeyId: { type: "string" },
|
|
1216
|
+
secretAccessKey: { type: "string" },
|
|
1217
|
+
sessionToken: { type: "string" },
|
|
1218
|
+
roleArn: { type: "string" },
|
|
1219
|
+
}
|
|
1220
|
+
},
|
|
1221
|
+
},
|
|
1222
|
+
required: ["type", "query"],
|
|
1223
|
+
},
|
|
1224
|
+
},
|
|
1225
|
+
merge_strategy: {
|
|
1226
|
+
type: "string",
|
|
1227
|
+
enum: ["union", "dedup"],
|
|
1228
|
+
description: "How to merge collected documents.",
|
|
1229
|
+
},
|
|
1230
|
+
deduplication: {
|
|
1231
|
+
type: "string",
|
|
1232
|
+
enum: ["semantic", "exact", "none"],
|
|
1233
|
+
description: "How to deduplicate across sources.",
|
|
1234
|
+
},
|
|
1235
|
+
},
|
|
1236
|
+
required: ["sources"],
|
|
1237
|
+
},
|
|
1238
|
+
},
|
|
1239
|
+
{
|
|
1240
|
+
name: "vesper.extract_web",
|
|
1241
|
+
description: "Phase 3 Structured Web extraction. Whitelist-only domains, deterministic extraction (tables/lists/infobox), schema validation, and cache fallback on live extraction failure.",
|
|
1242
|
+
inputSchema: {
|
|
1243
|
+
type: "object",
|
|
1244
|
+
properties: {
|
|
1245
|
+
url: { type: "string", description: "Target URL from approved whitelist domains." },
|
|
1246
|
+
mode: { type: "string", enum: ["auto", "table", "list", "infobox"], description: "Extraction mode (default auto)." },
|
|
1247
|
+
strict_schema: { type: "boolean", description: "When true (default), enforce domain-specific required fields." },
|
|
1248
|
+
schema: {
|
|
1249
|
+
type: "object",
|
|
1250
|
+
properties: {
|
|
1251
|
+
required_fields: {
|
|
1252
|
+
type: "array",
|
|
1253
|
+
items: { type: "string" },
|
|
1254
|
+
description: "Optional required top-level fields in extracted data payload."
|
|
1255
|
+
}
|
|
1256
|
+
}
|
|
1257
|
+
}
|
|
1258
|
+
},
|
|
1259
|
+
required: ["url"],
|
|
1260
|
+
},
|
|
1261
|
+
},
|
|
715
1262
|
{
|
|
716
1263
|
name: "discover_datasets",
|
|
717
|
-
description: "Discover datasets from a specific source.
|
|
1264
|
+
description: "Discover datasets from a specific source. Public providers work keylessly; Kaggle and data.world can also be exposed through server-managed credentials.",
|
|
718
1265
|
inputSchema: {
|
|
719
1266
|
type: "object",
|
|
720
1267
|
properties: {
|
|
@@ -724,7 +1271,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
724
1271
|
},
|
|
725
1272
|
source: {
|
|
726
1273
|
type: "string",
|
|
727
|
-
enum: ["huggingface", "kaggle", "openml", "dataworld"],
|
|
1274
|
+
enum: ["huggingface", "kaggle", "openml", "dataworld", "arxiv", "github"],
|
|
728
1275
|
description: "Data source to discover from.",
|
|
729
1276
|
},
|
|
730
1277
|
limit: {
|
|
@@ -737,7 +1284,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
737
1284
|
},
|
|
738
1285
|
{
|
|
739
1286
|
name: "download_dataset",
|
|
740
|
-
description: "Download a dataset by source and ID/slug into local
|
|
1287
|
+
description: "Download a dataset by source and ID/slug into a local directory. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
|
|
741
1288
|
inputSchema: {
|
|
742
1289
|
type: "object",
|
|
743
1290
|
properties: {
|
|
@@ -752,7 +1299,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
752
1299
|
},
|
|
753
1300
|
target_dir: {
|
|
754
1301
|
type: "string",
|
|
755
|
-
description: "Optional target directory for downloaded files.",
|
|
1302
|
+
description: "Optional target directory for downloaded files. Defaults to the current working directory when omitted.",
|
|
1303
|
+
},
|
|
1304
|
+
output_dir: {
|
|
1305
|
+
type: "string",
|
|
1306
|
+
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
756
1307
|
}
|
|
757
1308
|
},
|
|
758
1309
|
required: ["dataset_id"],
|
|
@@ -770,6 +1321,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
770
1321
|
kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
|
|
771
1322
|
urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
|
|
772
1323
|
output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
|
|
1324
|
+
target_dir: { type: "string", description: "Optional local directory where downloaded assets should be written. If provided, Vesper writes directly to this directory instead of managed asset storage." },
|
|
1325
|
+
output_dir: { type: "string", description: "Alias for target_dir. When provided, downloaded assets are written directly to this local directory." },
|
|
773
1326
|
max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
|
|
774
1327
|
workers: { type: "number", description: "Parallel worker count (default 8)." },
|
|
775
1328
|
image_column: { type: "string", description: "Explicit image column name. If omitted, auto-detected from HF features, column names, and sample values." },
|
|
@@ -877,6 +1430,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
877
1430
|
properties: {
|
|
878
1431
|
query: { type: "string" },
|
|
879
1432
|
requirements: { type: "string" },
|
|
1433
|
+
target_dir: { type: "string", description: "Optional local directory for the prepared dataset. Defaults to the current working directory when omitted." },
|
|
1434
|
+
output_dir: { type: "string", description: "Alias for target_dir. Defaults to the current working directory when omitted." },
|
|
880
1435
|
download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
|
|
881
1436
|
cleaning_options: { type: "object" },
|
|
882
1437
|
split_config: { type: "object" },
|
|
@@ -921,7 +1476,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
921
1476
|
},
|
|
922
1477
|
target_dir: {
|
|
923
1478
|
type: "string",
|
|
924
|
-
description: "Optional custom local directory for export
|
|
1479
|
+
description: "Optional custom local directory for export. Defaults to the current working directory when omitted.",
|
|
1480
|
+
},
|
|
1481
|
+
output_dir: {
|
|
1482
|
+
type: "string",
|
|
1483
|
+
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
925
1484
|
},
|
|
926
1485
|
format: {
|
|
927
1486
|
type: "string",
|
|
@@ -962,6 +1521,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
962
1521
|
properties: {},
|
|
963
1522
|
},
|
|
964
1523
|
},
|
|
1524
|
+
{
|
|
1525
|
+
name: "vesper_convert_format",
|
|
1526
|
+
description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
|
|
1527
|
+
inputSchema: {
|
|
1528
|
+
type: "object",
|
|
1529
|
+
properties: {
|
|
1530
|
+
file_path: {
|
|
1531
|
+
type: "string",
|
|
1532
|
+
description: "Absolute path to the input dataset file.",
|
|
1533
|
+
},
|
|
1534
|
+
target_format: {
|
|
1535
|
+
type: "string",
|
|
1536
|
+
enum: ["csv", "parquet", "json", "jsonl"],
|
|
1537
|
+
description: "The desired output format.",
|
|
1538
|
+
},
|
|
1539
|
+
},
|
|
1540
|
+
required: ["file_path", "target_format"],
|
|
1541
|
+
},
|
|
1542
|
+
},
|
|
965
1543
|
{
|
|
966
1544
|
name: "fuse_datasets",
|
|
967
1545
|
description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
|
|
@@ -1069,925 +1647,1225 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1069
1647
|
],
|
|
1070
1648
|
};
|
|
1071
1649
|
});
|
|
1072
|
-
// Call Tool
|
|
1650
|
+
// Call Tool — all requests are serialized through a queue to prevent crashes from parallel calls
|
|
1073
1651
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1652
|
+
return requestQueue.enqueue(async () => {
|
|
1653
|
+
// --- Pipeline Enforcement ---
|
|
1654
|
+
// Map tool names to pipeline steps
|
|
1655
|
+
const toolToStep = {
|
|
1656
|
+
vesper_search: "search",
|
|
1657
|
+
vesper_download: "download",
|
|
1658
|
+
vesper_analyze: "analyze",
|
|
1659
|
+
vesper_clean: "clean",
|
|
1660
|
+
vesper_split: "split",
|
|
1661
|
+
vesper_export: "export",
|
|
1662
|
+
prepare_dataset: "prepare",
|
|
1663
|
+
};
|
|
1664
|
+
// Extract dataset_id if present and normalize
|
|
1665
|
+
let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
|
|
1666
|
+
if (datasetId)
|
|
1667
|
+
datasetId = parseDatasetId(String(datasetId));
|
|
1668
|
+
// Pipeline rules
|
|
1669
|
+
const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
|
|
1670
|
+
const prereqs = {
|
|
1671
|
+
vesper_download: ["search"],
|
|
1672
|
+
vesper_analyze: ["download"],
|
|
1673
|
+
vesper_clean: ["analyze"],
|
|
1674
|
+
vesper_split: ["clean"],
|
|
1675
|
+
vesper_export: ["split"],
|
|
1676
|
+
};
|
|
1677
|
+
const tool = String(request.params.name);
|
|
1678
|
+
const step = toolToStep[tool];
|
|
1679
|
+
if (step && datasetId) {
|
|
1680
|
+
// Check prerequisites
|
|
1681
|
+
const required = prereqs[tool] || [];
|
|
1682
|
+
for (const req of required) {
|
|
1683
|
+
if (!hasStep(String(datasetId), req)) {
|
|
1684
|
+
// Auto-run missing step if possible, else error
|
|
1685
|
+
// For export, auto-run prepare_dataset if split missing
|
|
1686
|
+
if (tool === "vesper_export" && req === "split") {
|
|
1687
|
+
// Auto-trigger prepare_dataset (start a background prepare job)
|
|
1688
|
+
try {
|
|
1689
|
+
jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false, outputDir: process.cwd() });
|
|
1690
|
+
// Mark split as complete so export can proceed; export handler will also wait for data if needed.
|
|
1691
|
+
markStepComplete(String(datasetId), "split");
|
|
1692
|
+
}
|
|
1693
|
+
catch (e) {
|
|
1694
|
+
console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
|
|
1695
|
+
return {
|
|
1696
|
+
content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
|
|
1697
|
+
isError: true,
|
|
1698
|
+
};
|
|
1699
|
+
}
|
|
1113
1700
|
}
|
|
1114
|
-
|
|
1115
|
-
console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
|
|
1701
|
+
else {
|
|
1116
1702
|
return {
|
|
1117
|
-
content: [{ type: "text", text: `ERROR:
|
|
1703
|
+
content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
|
|
1118
1704
|
isError: true,
|
|
1119
1705
|
};
|
|
1120
1706
|
}
|
|
1121
1707
|
}
|
|
1122
|
-
|
|
1708
|
+
}
|
|
1709
|
+
// Mark this step as complete
|
|
1710
|
+
markStepComplete(String(datasetId), String(step));
|
|
1711
|
+
}
|
|
1712
|
+
switch (request.params.name) {
|
|
1713
|
+
case "vesper_web_find": {
|
|
1714
|
+
hydrateExternalKeys();
|
|
1715
|
+
const query = String(request.params.arguments?.query || "").trim();
|
|
1716
|
+
const limit = Number(request.params.arguments?.limit || 10);
|
|
1717
|
+
const sources = Array.isArray(request.params.arguments?.sources)
|
|
1718
|
+
? (request.params.arguments?.sources).map(s => String(s).trim().toLowerCase()).filter(Boolean)
|
|
1719
|
+
: undefined;
|
|
1720
|
+
try {
|
|
1721
|
+
const result = await webCoreEngine.find({
|
|
1722
|
+
query,
|
|
1723
|
+
sources: sources,
|
|
1724
|
+
limit,
|
|
1725
|
+
arxiv_full_text: request.params.arguments?.arxiv_full_text === true,
|
|
1726
|
+
github_include_readme: request.params.arguments?.github_include_readme === true,
|
|
1727
|
+
});
|
|
1123
1728
|
return {
|
|
1124
|
-
content: [{ type: "text", text:
|
|
1729
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1730
|
+
};
|
|
1731
|
+
}
|
|
1732
|
+
catch (error) {
|
|
1733
|
+
return {
|
|
1734
|
+
content: [{ type: "text", text: `ERROR: web_find failed: ${error.message}` }],
|
|
1125
1735
|
isError: true,
|
|
1126
1736
|
};
|
|
1127
1737
|
}
|
|
1128
1738
|
}
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1739
|
+
case "vesper.fuse": {
|
|
1740
|
+
hydrateExternalKeys();
|
|
1741
|
+
const sources = Array.isArray(request.params.arguments?.sources)
|
|
1742
|
+
? request.params.arguments?.sources
|
|
1743
|
+
: undefined;
|
|
1744
|
+
if (!sources || !Array.isArray(sources)) {
|
|
1745
|
+
return {
|
|
1746
|
+
content: [{ type: "text", text: "ERROR: vesper.fuse requires 'sources' array." }],
|
|
1747
|
+
isError: true,
|
|
1748
|
+
};
|
|
1749
|
+
}
|
|
1750
|
+
try {
|
|
1751
|
+
const mergeStrategyRaw = request.params.arguments?.merge_strategy
|
|
1752
|
+
? String(request.params.arguments?.merge_strategy).toLowerCase()
|
|
1753
|
+
: undefined;
|
|
1754
|
+
const dedupRaw = request.params.arguments?.deduplication
|
|
1755
|
+
? String(request.params.arguments?.deduplication).toLowerCase()
|
|
1756
|
+
: undefined;
|
|
1757
|
+
const merge_strategy = mergeStrategyRaw && ["union", "dedup"].includes(mergeStrategyRaw)
|
|
1758
|
+
? mergeStrategyRaw
|
|
1759
|
+
: undefined;
|
|
1760
|
+
const deduplication = dedupRaw && ["semantic", "exact", "none"].includes(dedupRaw)
|
|
1761
|
+
? dedupRaw
|
|
1762
|
+
: undefined;
|
|
1763
|
+
const result = await webFusionEngine.fuse({
|
|
1764
|
+
sources: sources.map((s) => ({
|
|
1765
|
+
type: String(s?.type || "").trim().toLowerCase(),
|
|
1766
|
+
query: String(s?.query || "").trim(),
|
|
1767
|
+
max_results: s?.max_results !== undefined ? Number(s.max_results) : undefined,
|
|
1768
|
+
min_stars: s?.min_stars !== undefined ? Number(s.min_stars) : undefined,
|
|
1769
|
+
bucket: s?.bucket !== undefined ? String(s.bucket) : undefined,
|
|
1770
|
+
path: s?.path !== undefined ? String(s.path) : undefined,
|
|
1771
|
+
region: s?.region !== undefined ? String(s.region) : undefined,
|
|
1772
|
+
credentials: s?.credentials ? {
|
|
1773
|
+
accessKeyId: s.credentials.accessKeyId !== undefined ? String(s.credentials.accessKeyId) : undefined,
|
|
1774
|
+
secretAccessKey: s.credentials.secretAccessKey !== undefined ? String(s.credentials.secretAccessKey) : undefined,
|
|
1775
|
+
sessionToken: s.credentials.sessionToken !== undefined ? String(s.credentials.sessionToken) : undefined,
|
|
1776
|
+
roleArn: s.credentials.roleArn !== undefined ? String(s.credentials.roleArn) : undefined,
|
|
1777
|
+
} : undefined,
|
|
1778
|
+
})),
|
|
1779
|
+
merge_strategy,
|
|
1780
|
+
deduplication,
|
|
1781
|
+
});
|
|
1782
|
+
return {
|
|
1783
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1784
|
+
};
|
|
1785
|
+
}
|
|
1786
|
+
catch (error) {
|
|
1787
|
+
return {
|
|
1788
|
+
content: [{ type: "text", text: `ERROR: vesper.fuse failed: ${error.message}` }],
|
|
1789
|
+
isError: true,
|
|
1790
|
+
};
|
|
1791
|
+
}
|
|
1141
1792
|
}
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1793
|
+
case "vesper.extract_web": {
|
|
1794
|
+
hydrateExternalKeys();
|
|
1795
|
+
const url = String(request.params.arguments?.url || "").trim();
|
|
1796
|
+
const mode = request.params.arguments?.mode
|
|
1797
|
+
? String(request.params.arguments?.mode).trim().toLowerCase()
|
|
1798
|
+
: "auto";
|
|
1799
|
+
const schema = request.params.arguments?.schema && typeof request.params.arguments.schema === "object"
|
|
1800
|
+
? request.params.arguments.schema
|
|
1801
|
+
: undefined;
|
|
1802
|
+
if (!url) {
|
|
1803
|
+
return {
|
|
1804
|
+
content: [{ type: "text", text: "ERROR: vesper.extract_web requires 'url'." }],
|
|
1805
|
+
isError: true,
|
|
1806
|
+
};
|
|
1807
|
+
}
|
|
1808
|
+
try {
|
|
1809
|
+
const out = await webExtractorEngine.extract({
|
|
1810
|
+
url,
|
|
1811
|
+
mode: mode,
|
|
1812
|
+
strict_schema: request.params.arguments?.strict_schema !== false,
|
|
1813
|
+
schema: schema,
|
|
1814
|
+
});
|
|
1815
|
+
return {
|
|
1816
|
+
content: [{ type: "text", text: JSON.stringify(out, null, 2) }],
|
|
1817
|
+
};
|
|
1818
|
+
}
|
|
1819
|
+
catch (error) {
|
|
1820
|
+
return {
|
|
1821
|
+
content: [{ type: "text", text: `ERROR: vesper.extract_web failed: ${error.message}` }],
|
|
1822
|
+
isError: true,
|
|
1823
|
+
};
|
|
1824
|
+
}
|
|
1160
1825
|
}
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1826
|
+
case "unified_dataset_api": {
|
|
1827
|
+
hydrateExternalKeys();
|
|
1828
|
+
const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
|
|
1829
|
+
const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
|
|
1830
|
+
const includeUnavailable = request.params.arguments?.include_unavailable === true;
|
|
1831
|
+
const publicOnly = request.params.arguments?.public_only !== false;
|
|
1832
|
+
try {
|
|
1833
|
+
if (operation === "providers") {
|
|
1165
1834
|
return {
|
|
1166
|
-
content: [{ type: "text", text:
|
|
1167
|
-
isError: true,
|
|
1835
|
+
content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
|
|
1168
1836
|
};
|
|
1169
1837
|
}
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1838
|
+
if (operation === "discover") {
|
|
1839
|
+
const query = String(request.params.arguments?.query || "").trim();
|
|
1840
|
+
if (!query) {
|
|
1841
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
|
|
1842
|
+
}
|
|
1843
|
+
const result = await unifiedDatasetGateway.discover({
|
|
1844
|
+
query,
|
|
1845
|
+
source,
|
|
1846
|
+
limit: Number(request.params.arguments?.limit || 10),
|
|
1847
|
+
publicOnly,
|
|
1848
|
+
});
|
|
1178
1849
|
return {
|
|
1179
|
-
content: [{ type: "text", text:
|
|
1180
|
-
isError: true,
|
|
1850
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1181
1851
|
};
|
|
1182
1852
|
}
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1853
|
+
if (operation === "download") {
|
|
1854
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1855
|
+
if (!datasetId) {
|
|
1856
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
|
|
1857
|
+
}
|
|
1858
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
1859
|
+
? String(request.params.arguments.target_dir).trim()
|
|
1860
|
+
: request.params.arguments?.output_dir
|
|
1861
|
+
? String(request.params.arguments.output_dir).trim()
|
|
1862
|
+
: "";
|
|
1863
|
+
const targetDir = requestedTargetDir || process.cwd();
|
|
1864
|
+
try {
|
|
1865
|
+
await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
|
|
1866
|
+
}
|
|
1867
|
+
catch {
|
|
1868
|
+
// best effort; non-HF providers do not require this
|
|
1869
|
+
}
|
|
1870
|
+
const result = await unifiedDatasetGateway.download({
|
|
1871
|
+
datasetId,
|
|
1872
|
+
source,
|
|
1873
|
+
targetDir,
|
|
1874
|
+
});
|
|
1875
|
+
try {
|
|
1876
|
+
upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
|
|
1877
|
+
}
|
|
1878
|
+
catch (e) {
|
|
1879
|
+
console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
|
|
1880
|
+
}
|
|
1881
|
+
return {
|
|
1882
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1883
|
+
};
|
|
1203
1884
|
}
|
|
1204
|
-
|
|
1205
|
-
|
|
1885
|
+
if (operation === "info") {
|
|
1886
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1887
|
+
if (!datasetId) {
|
|
1888
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
|
|
1889
|
+
}
|
|
1890
|
+
const result = await unifiedDatasetGateway.info({
|
|
1891
|
+
datasetId,
|
|
1892
|
+
source,
|
|
1893
|
+
publicOnly,
|
|
1894
|
+
});
|
|
1895
|
+
return {
|
|
1896
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1897
|
+
};
|
|
1206
1898
|
}
|
|
1899
|
+
throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
|
|
1900
|
+
}
|
|
1901
|
+
catch (error) {
|
|
1902
|
+
return {
|
|
1903
|
+
content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
|
|
1904
|
+
isError: true,
|
|
1905
|
+
};
|
|
1207
1906
|
}
|
|
1208
|
-
const formattedOutput = formatSearchResults(results.slice(0, limit));
|
|
1209
|
-
return {
|
|
1210
|
-
content: [{ type: "text", text: formattedOutput }]
|
|
1211
|
-
};
|
|
1212
|
-
}
|
|
1213
|
-
catch (error) {
|
|
1214
|
-
return {
|
|
1215
|
-
content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
|
|
1216
|
-
isError: true,
|
|
1217
|
-
};
|
|
1218
|
-
}
|
|
1219
|
-
}
|
|
1220
|
-
case "download_dataset": {
|
|
1221
|
-
hydrateExternalKeys();
|
|
1222
|
-
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1223
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1224
|
-
if (!datasetId) {
|
|
1225
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1226
|
-
}
|
|
1227
|
-
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
1228
|
-
return {
|
|
1229
|
-
content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or switch to source='huggingface' which works without credentials.` }],
|
|
1230
|
-
isError: true,
|
|
1231
|
-
};
|
|
1232
1907
|
}
|
|
1233
|
-
|
|
1908
|
+
case "vesper_search": {
|
|
1909
|
+
const query = String(request.params.arguments?.query);
|
|
1910
|
+
const limit = 5;
|
|
1911
|
+
const safeOnly = true; // Enable safe filter by default
|
|
1912
|
+
const enableJIT = request.params.arguments?.enable_jit === true;
|
|
1913
|
+
if (!query) {
|
|
1914
|
+
throw new McpError(ErrorCode.InvalidParams, "Query is required");
|
|
1915
|
+
}
|
|
1916
|
+
const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
|
|
1917
|
+
const formattedOutput = formatSearchResults(results);
|
|
1234
1918
|
return {
|
|
1235
|
-
content: [
|
|
1236
|
-
|
|
1919
|
+
content: [
|
|
1920
|
+
{
|
|
1921
|
+
type: "text",
|
|
1922
|
+
text: formattedOutput,
|
|
1923
|
+
},
|
|
1924
|
+
],
|
|
1237
1925
|
};
|
|
1238
1926
|
}
|
|
1239
|
-
|
|
1240
|
-
|
|
1927
|
+
case "discover_datasets": {
|
|
1928
|
+
hydrateExternalKeys();
|
|
1929
|
+
const query = String(request.params.arguments?.query || "").trim();
|
|
1930
|
+
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1931
|
+
const limit = Number(request.params.arguments?.limit || 10);
|
|
1932
|
+
if (!query) {
|
|
1933
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required");
|
|
1934
|
+
}
|
|
1241
1935
|
try {
|
|
1242
|
-
await
|
|
1243
|
-
|
|
1244
|
-
|
|
1936
|
+
const gatewayResult = await unifiedDatasetGateway.discover({
|
|
1937
|
+
query,
|
|
1938
|
+
source,
|
|
1939
|
+
limit,
|
|
1940
|
+
publicOnly: false,
|
|
1941
|
+
});
|
|
1942
|
+
const results = gatewayResult.results;
|
|
1943
|
+
const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
1944
|
+
for (const ds of results.slice(0, limit)) {
|
|
1945
|
+
const info = {
|
|
1946
|
+
dataset_id: ds.id,
|
|
1947
|
+
id: ds.id,
|
|
1948
|
+
source: ds.source,
|
|
1949
|
+
repo_id: ds.id,
|
|
1950
|
+
total_images: ds.total_examples || 0,
|
|
1951
|
+
image_column: undefined,
|
|
1952
|
+
recipes_dir: path.join(dataRoot, "recipes"),
|
|
1953
|
+
};
|
|
1954
|
+
try {
|
|
1955
|
+
await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
|
|
1956
|
+
}
|
|
1957
|
+
catch {
|
|
1958
|
+
// best-effort recipe generation; ignore discovery-time recipe failures
|
|
1959
|
+
}
|
|
1960
|
+
}
|
|
1961
|
+
const formattedOutput = formatSearchResults(results.slice(0, limit));
|
|
1962
|
+
const noteBlock = gatewayResult.notes.length > 0
|
|
1963
|
+
? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
|
|
1964
|
+
: "";
|
|
1965
|
+
return {
|
|
1966
|
+
content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
|
|
1967
|
+
};
|
|
1245
1968
|
}
|
|
1246
|
-
catch {
|
|
1247
|
-
|
|
1969
|
+
catch (error) {
|
|
1970
|
+
return {
|
|
1971
|
+
content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
|
|
1972
|
+
isError: true,
|
|
1973
|
+
};
|
|
1248
1974
|
}
|
|
1249
1975
|
}
|
|
1250
|
-
|
|
1251
|
-
|
|
1976
|
+
case "download_dataset": {
|
|
1977
|
+
hydrateExternalKeys();
|
|
1978
|
+
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1979
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1980
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
1981
|
+
? String(request.params.arguments.target_dir).trim()
|
|
1982
|
+
: request.params.arguments?.output_dir
|
|
1983
|
+
? String(request.params.arguments.output_dir).trim()
|
|
1984
|
+
: "";
|
|
1985
|
+
const targetDir = requestedTargetDir || process.cwd();
|
|
1986
|
+
if (!datasetId) {
|
|
1987
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1988
|
+
}
|
|
1989
|
+
// Pre-install Python datasets library for HuggingFace fallback
|
|
1990
|
+
if (source === "huggingface") {
|
|
1991
|
+
try {
|
|
1992
|
+
await ensurePythonModules([
|
|
1993
|
+
{ module: "datasets", packageName: "datasets" },
|
|
1994
|
+
]);
|
|
1995
|
+
}
|
|
1996
|
+
catch {
|
|
1997
|
+
// Continue - direct download may still work
|
|
1998
|
+
}
|
|
1999
|
+
}
|
|
1252
2000
|
try {
|
|
1253
|
-
|
|
2001
|
+
const result = await unifiedDatasetGateway.download({
|
|
2002
|
+
datasetId,
|
|
2003
|
+
source,
|
|
2004
|
+
targetDir,
|
|
2005
|
+
});
|
|
2006
|
+
try {
|
|
2007
|
+
upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
|
|
2008
|
+
}
|
|
2009
|
+
catch (e) {
|
|
2010
|
+
console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
|
|
2011
|
+
}
|
|
2012
|
+
const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
|
|
2013
|
+
return {
|
|
2014
|
+
content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
|
|
2015
|
+
};
|
|
1254
2016
|
}
|
|
1255
|
-
catch (
|
|
1256
|
-
|
|
2017
|
+
catch (error) {
|
|
2018
|
+
return {
|
|
2019
|
+
content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
|
|
2020
|
+
isError: true,
|
|
2021
|
+
};
|
|
1257
2022
|
}
|
|
1258
|
-
return {
|
|
1259
|
-
content: [{ type: "text", text: `Download complete: ${localPath}` }]
|
|
1260
|
-
};
|
|
1261
2023
|
}
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
2024
|
+
case "vesper_download_assets": {
|
|
2025
|
+
hydrateExternalKeys();
|
|
2026
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
2027
|
+
const source = String(request.params.arguments?.source || "").trim().toLowerCase();
|
|
2028
|
+
// Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
|
|
2029
|
+
const repoId = request.params.arguments?.repo_id
|
|
2030
|
+
? String(request.params.arguments.repo_id)
|
|
2031
|
+
: (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
|
|
2032
|
+
const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
|
|
2033
|
+
const urls = Array.isArray(request.params.arguments?.urls)
|
|
2034
|
+
? (request.params.arguments?.urls).map(v => String(v))
|
|
2035
|
+
: undefined;
|
|
2036
|
+
const outputFormat = String(request.params.arguments?.output_format || "webdataset");
|
|
2037
|
+
const requestedOutputDir = request.params.arguments?.target_dir
|
|
2038
|
+
? String(request.params.arguments.target_dir).trim()
|
|
2039
|
+
: request.params.arguments?.output_dir
|
|
2040
|
+
? String(request.params.arguments.output_dir).trim()
|
|
2041
|
+
: undefined;
|
|
2042
|
+
const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
|
|
2043
|
+
const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
|
|
2044
|
+
const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
|
|
2045
|
+
if (!datasetId || !source) {
|
|
2046
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
|
|
2047
|
+
}
|
|
2048
|
+
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
2049
|
+
return {
|
|
2050
|
+
content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
|
|
2051
|
+
isError: true,
|
|
2052
|
+
};
|
|
2053
|
+
}
|
|
2054
|
+
const requiredModules = [
|
|
2055
|
+
{ module: "aiohttp", packageName: "aiohttp" },
|
|
2056
|
+
];
|
|
2057
|
+
if (source === "url") {
|
|
2058
|
+
requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
|
|
2059
|
+
}
|
|
2060
|
+
if (source === "huggingface") {
|
|
2061
|
+
requiredModules.push({ module: "datasets", packageName: "datasets" });
|
|
2062
|
+
requiredModules.push({ module: "PIL", packageName: "Pillow" });
|
|
2063
|
+
}
|
|
2064
|
+
if (source === "kaggle") {
|
|
2065
|
+
requiredModules.push({ module: "kaggle", packageName: "kaggle" });
|
|
2066
|
+
}
|
|
2067
|
+
try {
|
|
2068
|
+
await ensurePythonModules(requiredModules);
|
|
2069
|
+
}
|
|
2070
|
+
catch (error) {
|
|
2071
|
+
return {
|
|
2072
|
+
content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
|
|
2073
|
+
isError: true,
|
|
2074
|
+
};
|
|
2075
|
+
}
|
|
2076
|
+
const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
2077
|
+
const payload = {
|
|
2078
|
+
dataset_id: datasetId,
|
|
2079
|
+
source,
|
|
2080
|
+
repo_id: repoId,
|
|
2081
|
+
kaggle_ref: kaggleRef,
|
|
2082
|
+
urls,
|
|
2083
|
+
output_format: outputFormat,
|
|
2084
|
+
output_dir: requestedOutputDir,
|
|
2085
|
+
max_items: maxItems,
|
|
2086
|
+
workers,
|
|
2087
|
+
image_column: imageColumn,
|
|
2088
|
+
output_root: requestedOutputDir || process.cwd(),
|
|
2089
|
+
recipes_dir: path.join(dataRoot, "recipes"),
|
|
1266
2090
|
};
|
|
2091
|
+
try {
|
|
2092
|
+
const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
|
|
2093
|
+
if (!result?.ok) {
|
|
2094
|
+
const errMsg = result?.error || "Unknown error";
|
|
2095
|
+
// Enhance error messages for common failures
|
|
2096
|
+
let hint = "";
|
|
2097
|
+
if (errMsg.includes("No image column")) {
|
|
2098
|
+
hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
|
|
2099
|
+
}
|
|
2100
|
+
else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
|
|
2101
|
+
hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
|
|
2102
|
+
}
|
|
2103
|
+
return {
|
|
2104
|
+
content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
|
|
2105
|
+
isError: true,
|
|
2106
|
+
};
|
|
2107
|
+
}
|
|
2108
|
+
return {
|
|
2109
|
+
content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
|
|
2110
|
+
};
|
|
2111
|
+
}
|
|
2112
|
+
catch (error) {
|
|
2113
|
+
return {
|
|
2114
|
+
content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
|
|
2115
|
+
isError: true,
|
|
2116
|
+
};
|
|
2117
|
+
}
|
|
1267
2118
|
}
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
const urls = Array.isArray(request.params.arguments?.urls)
|
|
1279
|
-
? (request.params.arguments?.urls).map(v => String(v))
|
|
1280
|
-
: undefined;
|
|
1281
|
-
const outputFormat = String(request.params.arguments?.output_format || "webdataset");
|
|
1282
|
-
const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
|
|
1283
|
-
const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
|
|
1284
|
-
const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
|
|
1285
|
-
if (!datasetId || !source) {
|
|
1286
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
|
|
1287
|
-
}
|
|
1288
|
-
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
2119
|
+
case "configure_kaggle": {
|
|
2120
|
+
const username = String(request.params.arguments?.username || "").trim();
|
|
2121
|
+
const key = String(request.params.arguments?.key || "").trim();
|
|
2122
|
+
if (!username || !key) {
|
|
2123
|
+
throw new McpError(ErrorCode.InvalidParams, "username and key are required");
|
|
2124
|
+
}
|
|
2125
|
+
const r1 = secureKeys.set("kaggle_username", username);
|
|
2126
|
+
const r2 = secureKeys.set("kaggle_key", key);
|
|
2127
|
+
process.env.KAGGLE_USERNAME = username;
|
|
2128
|
+
process.env.KAGGLE_KEY = key;
|
|
1289
2129
|
return {
|
|
1290
|
-
content: [{ type: "text", text:
|
|
1291
|
-
isError: true,
|
|
2130
|
+
content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
|
|
1292
2131
|
};
|
|
1293
2132
|
}
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
2133
|
+
case "configure_keys": {
|
|
2134
|
+
const hfToken = String(request.params.arguments?.hf_token || "").trim();
|
|
2135
|
+
const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
|
|
2136
|
+
const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
|
|
2137
|
+
const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
|
|
2138
|
+
const saved = [];
|
|
2139
|
+
const methods = [];
|
|
2140
|
+
if (hfToken) {
|
|
2141
|
+
const r = secureKeys.set("hf_token", hfToken);
|
|
2142
|
+
if (r.ok) {
|
|
2143
|
+
process.env.HF_TOKEN = hfToken;
|
|
2144
|
+
saved.push("HF token");
|
|
2145
|
+
if (r.method)
|
|
2146
|
+
methods.push(r.method);
|
|
2147
|
+
}
|
|
2148
|
+
}
|
|
2149
|
+
if (kaggleUsername) {
|
|
2150
|
+
const r = secureKeys.set("kaggle_username", kaggleUsername);
|
|
2151
|
+
if (r.ok) {
|
|
2152
|
+
process.env.KAGGLE_USERNAME = kaggleUsername;
|
|
2153
|
+
saved.push("Kaggle username");
|
|
2154
|
+
if (r.method)
|
|
2155
|
+
methods.push(r.method);
|
|
2156
|
+
}
|
|
2157
|
+
}
|
|
2158
|
+
if (kaggleKey) {
|
|
2159
|
+
const r = secureKeys.set("kaggle_key", kaggleKey);
|
|
2160
|
+
if (r.ok) {
|
|
2161
|
+
process.env.KAGGLE_KEY = kaggleKey;
|
|
2162
|
+
saved.push("Kaggle key");
|
|
2163
|
+
if (r.method)
|
|
2164
|
+
methods.push(r.method);
|
|
2165
|
+
}
|
|
2166
|
+
}
|
|
2167
|
+
if (dataworldToken) {
|
|
2168
|
+
const r = secureKeys.set("dataworld_token", dataworldToken);
|
|
2169
|
+
if (r.ok) {
|
|
2170
|
+
process.env.DW_AUTH_TOKEN = dataworldToken;
|
|
2171
|
+
saved.push("data.world token");
|
|
2172
|
+
if (r.method)
|
|
2173
|
+
methods.push(r.method);
|
|
2174
|
+
}
|
|
2175
|
+
}
|
|
2176
|
+
if (saved.length === 0) {
|
|
2177
|
+
return {
|
|
2178
|
+
content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
|
|
2179
|
+
};
|
|
2180
|
+
}
|
|
1311
2181
|
return {
|
|
1312
|
-
content: [{ type: "text", text: `
|
|
1313
|
-
isError: true,
|
|
2182
|
+
content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
|
|
1314
2183
|
};
|
|
1315
2184
|
}
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
|
|
1332
|
-
if (!result?.ok) {
|
|
1333
|
-
const errMsg = result?.error || "Unknown error";
|
|
1334
|
-
// Enhance error messages for common failures
|
|
1335
|
-
let hint = "";
|
|
1336
|
-
if (errMsg.includes("No image column")) {
|
|
1337
|
-
hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
|
|
1338
|
-
}
|
|
1339
|
-
else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
|
|
1340
|
-
hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
|
|
2185
|
+
case "get_dataset_info": {
|
|
2186
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2187
|
+
if (!datasetId) {
|
|
2188
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
2189
|
+
}
|
|
2190
|
+
const dataset = metadataStore.getDataset(datasetId);
|
|
2191
|
+
if (!dataset) {
|
|
2192
|
+
// Fallback: check the registry for local path info
|
|
2193
|
+
const regEntry = getRegistryEntry(datasetId);
|
|
2194
|
+
const regPath = regEntry?.local_path || regEntry?.path;
|
|
2195
|
+
if (regEntry) {
|
|
2196
|
+
const exists = regPath && fs.existsSync(regPath);
|
|
2197
|
+
return {
|
|
2198
|
+
content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
|
|
2199
|
+
};
|
|
1341
2200
|
}
|
|
1342
2201
|
return {
|
|
1343
|
-
content: [{ type: "text", text: `ERROR:
|
|
2202
|
+
content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
|
|
1344
2203
|
isError: true,
|
|
1345
2204
|
};
|
|
1346
2205
|
}
|
|
2206
|
+
// Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
|
|
2207
|
+
if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
|
|
2208
|
+
try {
|
|
2209
|
+
const sizeResp = await fetch(`https://datasets-server.huggingface.co/size?dataset=${encodeURIComponent(dataset.id)}`);
|
|
2210
|
+
if (sizeResp.ok) {
|
|
2211
|
+
const sizeData = await sizeResp.json();
|
|
2212
|
+
const numRows = sizeData?.size?.dataset?.num_rows;
|
|
2213
|
+
if (numRows && numRows > 0) {
|
|
2214
|
+
dataset.total_examples = numRows;
|
|
2215
|
+
// Also backfill splits
|
|
2216
|
+
if (sizeData?.size?.splits && Array.isArray(sizeData.size.splits)) {
|
|
2217
|
+
dataset.splits = sizeData.size.splits.map((s) => ({
|
|
2218
|
+
name: s.split,
|
|
2219
|
+
num_examples: s.num_rows || 0,
|
|
2220
|
+
size_bytes: s.num_bytes_parquet_files || 0,
|
|
2221
|
+
}));
|
|
2222
|
+
dataset.has_train_split = dataset.splits.some((s) => s.name === "train");
|
|
2223
|
+
dataset.has_test_split = dataset.splits.some((s) => s.name === "test");
|
|
2224
|
+
dataset.has_validation_split = dataset.splits.some((s) => s.name === "validation" || s.name === "val");
|
|
2225
|
+
}
|
|
2226
|
+
// Persist enriched metadata
|
|
2227
|
+
metadataStore.saveDataset(dataset);
|
|
2228
|
+
}
|
|
2229
|
+
}
|
|
2230
|
+
}
|
|
2231
|
+
catch {
|
|
2232
|
+
// Enrichment is best-effort; continue with whatever we have
|
|
2233
|
+
}
|
|
2234
|
+
}
|
|
2235
|
+
const formattedOutput = formatDatasetInfo(dataset);
|
|
2236
|
+
return { content: [{ type: "text", text: formattedOutput }] };
|
|
2237
|
+
}
|
|
2238
|
+
case "analyze_quality": {
|
|
2239
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2240
|
+
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
2241
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
2242
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
2243
|
+
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
2244
|
+
// Demo Fallback for easy testing
|
|
2245
|
+
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
2246
|
+
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
2247
|
+
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
2248
|
+
if (fs.existsSync(demoParquetPath)) {
|
|
2249
|
+
filePath = demoParquetPath;
|
|
2250
|
+
}
|
|
2251
|
+
else if (fs.existsSync(demoCsvPath)) {
|
|
2252
|
+
filePath = demoCsvPath;
|
|
2253
|
+
}
|
|
2254
|
+
else if (datasetId !== "demo") {
|
|
2255
|
+
return {
|
|
2256
|
+
content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
|
|
2257
|
+
isError: true
|
|
2258
|
+
};
|
|
2259
|
+
}
|
|
2260
|
+
}
|
|
2261
|
+
const report = await qualityAnalyzer.analyze(filePath);
|
|
1347
2262
|
return {
|
|
1348
|
-
content: [{ type: "text", text: JSON.stringify(
|
|
2263
|
+
content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
|
|
1349
2264
|
};
|
|
1350
2265
|
}
|
|
1351
|
-
|
|
2266
|
+
case "preview_cleaning": {
|
|
2267
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2268
|
+
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
2269
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
2270
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
2271
|
+
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
2272
|
+
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
2273
|
+
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
2274
|
+
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
2275
|
+
if (fs.existsSync(demoParquetPath)) {
|
|
2276
|
+
filePath = demoParquetPath;
|
|
2277
|
+
}
|
|
2278
|
+
else if (fs.existsSync(demoCsvPath)) {
|
|
2279
|
+
filePath = demoCsvPath;
|
|
2280
|
+
}
|
|
2281
|
+
else {
|
|
2282
|
+
throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
|
|
2283
|
+
}
|
|
2284
|
+
}
|
|
2285
|
+
const report = await qualityAnalyzer.analyze(filePath);
|
|
2286
|
+
// Phase 1: Target Detection
|
|
2287
|
+
// We use the same TargetDetector instance inside CleaningPlanner now?
|
|
2288
|
+
// Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
|
|
2289
|
+
// OR let the planner handle it if we update its signature to accept filePath.
|
|
2290
|
+
// Let's check `CleaningPlanner.generatePlan` signature again.
|
|
2291
|
+
// We updated it to accept `targetInfo`.
|
|
2292
|
+
// So we need to run detection HERE and pass it.
|
|
2293
|
+
// But `TargetDetector` is not exposed in `index.ts` scope yet.
|
|
2294
|
+
// Let's create a global instance or use the one inside planner if exposed (it's private).
|
|
2295
|
+
// Better approach: Instantiate TargetDetector here in index.ts for the tool content.
|
|
2296
|
+
// Quick fix: Instantiate local detector or make global.
|
|
2297
|
+
// I'll make a global `targetDetector` constant in index.ts
|
|
2298
|
+
// But wait, I updated `CleaningPlanner` to instantiate its own detector.
|
|
2299
|
+
// Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
|
|
2300
|
+
// RETRY STRATEGY:
|
|
2301
|
+
// 1. Instantiate `targetDetector` in `index.ts`.
|
|
2302
|
+
// 2. Run `detectTarget(filePath)`.
|
|
2303
|
+
// 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
|
|
2304
|
+
// I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
|
|
2305
|
+
// But since I'm in this tool, I can't look back.
|
|
2306
|
+
// I will assume I can add it, or just do it inside the case for now.
|
|
2307
|
+
// To do it properly, I should have added `targetDetector` to the global scope in previous step.
|
|
2308
|
+
// Let's do that in a separate step if needed.
|
|
2309
|
+
// For now, I'll instantiate it here.
|
|
2310
|
+
const { TargetDetector } = await import("./preparation/target-detector.js");
|
|
2311
|
+
const detector = new TargetDetector(__dirname);
|
|
2312
|
+
const targetResult = await detector.detectTarget(filePath);
|
|
2313
|
+
const targetInfo = targetResult.target_column ? {
|
|
2314
|
+
target: targetResult.target_column,
|
|
2315
|
+
confidence: targetResult.confidence
|
|
2316
|
+
} : undefined;
|
|
2317
|
+
const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
|
|
2318
|
+
let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
|
|
2319
|
+
if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
|
|
2320
|
+
explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
|
|
2321
|
+
explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
|
|
2322
|
+
}
|
|
2323
|
+
explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
|
|
2324
|
+
if (plan.operations.length === 0) {
|
|
2325
|
+
explanation += "No cleaning operations required.";
|
|
2326
|
+
}
|
|
2327
|
+
else {
|
|
2328
|
+
plan.operations.forEach((op, i) => {
|
|
2329
|
+
explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
|
|
2330
|
+
});
|
|
2331
|
+
}
|
|
1352
2332
|
return {
|
|
1353
|
-
content: [{ type: "text", text:
|
|
1354
|
-
isError: true,
|
|
2333
|
+
content: [{ type: "text", text: explanation }]
|
|
1355
2334
|
};
|
|
1356
2335
|
}
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
throw new McpError(ErrorCode.InvalidParams, "username and key are required");
|
|
1363
|
-
}
|
|
1364
|
-
const r1 = secureKeys.set("kaggle_username", username);
|
|
1365
|
-
const r2 = secureKeys.set("kaggle_key", key);
|
|
1366
|
-
process.env.KAGGLE_USERNAME = username;
|
|
1367
|
-
process.env.KAGGLE_KEY = key;
|
|
1368
|
-
return {
|
|
1369
|
-
content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
|
|
1370
|
-
};
|
|
1371
|
-
}
|
|
1372
|
-
case "configure_keys": {
|
|
1373
|
-
const hfToken = String(request.params.arguments?.hf_token || "").trim();
|
|
1374
|
-
const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
|
|
1375
|
-
const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
|
|
1376
|
-
const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
|
|
1377
|
-
const saved = [];
|
|
1378
|
-
const methods = [];
|
|
1379
|
-
if (hfToken) {
|
|
1380
|
-
const r = secureKeys.set("hf_token", hfToken);
|
|
1381
|
-
if (r.ok) {
|
|
1382
|
-
process.env.HF_TOKEN = hfToken;
|
|
1383
|
-
saved.push("HF token");
|
|
1384
|
-
if (r.method)
|
|
1385
|
-
methods.push(r.method);
|
|
2336
|
+
case "custom_clean": {
|
|
2337
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2338
|
+
const ops = request.params.arguments?.operations;
|
|
2339
|
+
if (!datasetId || datasetId === "undefined") {
|
|
2340
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1386
2341
|
}
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
const r = secureKeys.set("kaggle_username", kaggleUsername);
|
|
1390
|
-
if (r.ok) {
|
|
1391
|
-
process.env.KAGGLE_USERNAME = kaggleUsername;
|
|
1392
|
-
saved.push("Kaggle username");
|
|
1393
|
-
if (r.method)
|
|
1394
|
-
methods.push(r.method);
|
|
2342
|
+
if (!ops || !Array.isArray(ops) || ops.length === 0) {
|
|
2343
|
+
throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
|
|
1395
2344
|
}
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
const
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
2345
|
+
// Pre-check: verify dataset file exists before starting the job
|
|
2346
|
+
const cleanRegEntry = getRegistryEntry(datasetId);
|
|
2347
|
+
const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
|
|
2348
|
+
const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
|
|
2349
|
+
const cleanSafeId = toSafeDatasetPathFragment(datasetId);
|
|
2350
|
+
const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
|
|
2351
|
+
(cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
|
|
2352
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
|
|
2353
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
|
|
2354
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
|
|
2355
|
+
fs.existsSync(datasetId);
|
|
2356
|
+
if (!cleanDataExists) {
|
|
2357
|
+
return {
|
|
2358
|
+
content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
|
|
2359
|
+
isError: true,
|
|
2360
|
+
};
|
|
1404
2361
|
}
|
|
2362
|
+
const job = jobManager.createJob("clean", 0, { datasetId, ops });
|
|
2363
|
+
return {
|
|
2364
|
+
content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
|
|
2365
|
+
};
|
|
1405
2366
|
}
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
2367
|
+
case "prepare_dataset": {
|
|
2368
|
+
hydrateExternalKeys();
|
|
2369
|
+
const query = String(request.params.arguments?.query);
|
|
2370
|
+
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
2371
|
+
const downloadImages = request.params.arguments?.download_images === true;
|
|
2372
|
+
const requestedOutputDir = request.params.arguments?.target_dir
|
|
2373
|
+
? String(request.params.arguments.target_dir).trim()
|
|
2374
|
+
: request.params.arguments?.output_dir
|
|
2375
|
+
? String(request.params.arguments.output_dir).trim()
|
|
2376
|
+
: "";
|
|
2377
|
+
const outputDir = requestedOutputDir || process.cwd();
|
|
2378
|
+
if (!query || query === "undefined") {
|
|
2379
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
|
|
1413
2380
|
}
|
|
1414
|
-
|
|
1415
|
-
if (saved.length === 0) {
|
|
2381
|
+
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
|
|
1416
2382
|
return {
|
|
1417
|
-
content: [{ type: "text", text:
|
|
2383
|
+
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
|
|
1418
2384
|
};
|
|
1419
2385
|
}
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
const dataset = metadataStore.getDataset(datasetId);
|
|
1430
|
-
if (!dataset) {
|
|
2386
|
+
case "compare_datasets": {
|
|
2387
|
+
const datasetIds = request.params.arguments?.dataset_ids;
|
|
2388
|
+
const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
|
|
2389
|
+
let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
|
|
2390
|
+
comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
|
|
2391
|
+
comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
|
|
2392
|
+
comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
|
|
2393
|
+
comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
|
|
2394
|
+
comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
|
|
1431
2395
|
return {
|
|
1432
|
-
content: [{ type: "text", text:
|
|
1433
|
-
isError: true,
|
|
2396
|
+
content: [{ type: "text", text: comparison }]
|
|
1434
2397
|
};
|
|
1435
2398
|
}
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
1449
|
-
if (fs.existsSync(demoParquetPath)) {
|
|
1450
|
-
filePath = demoParquetPath;
|
|
1451
|
-
}
|
|
1452
|
-
else if (fs.existsSync(demoCsvPath)) {
|
|
1453
|
-
filePath = demoCsvPath;
|
|
1454
|
-
}
|
|
1455
|
-
else if (datasetId !== "demo") {
|
|
2399
|
+
case "check_job_status": {
|
|
2400
|
+
const jobId = String(request.params.arguments?.job_id);
|
|
2401
|
+
const job = metadataStore.getJob(jobId);
|
|
2402
|
+
if (!job) {
|
|
2403
|
+
throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
|
|
2404
|
+
}
|
|
2405
|
+
const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
|
|
2406
|
+
const now = Date.now();
|
|
2407
|
+
const last = jobStatusLastPoll[jobId] || 0;
|
|
2408
|
+
const minPollMs = 3000;
|
|
2409
|
+
if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
|
|
2410
|
+
const waitMs = minPollMs - (now - last);
|
|
1456
2411
|
return {
|
|
1457
|
-
content: [{ type: "text", text: `
|
|
1458
|
-
isError: true
|
|
2412
|
+
content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
|
|
1459
2413
|
};
|
|
1460
2414
|
}
|
|
2415
|
+
jobStatusLastPoll[jobId] = now;
|
|
2416
|
+
return {
|
|
2417
|
+
content: [{ type: "text", text: formatJobStatus(job) }]
|
|
2418
|
+
};
|
|
1461
2419
|
}
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
const
|
|
1475
|
-
const
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
2420
|
+
case "export_dataset": {
|
|
2421
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2422
|
+
const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
|
|
2423
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
2424
|
+
? String(request.params.arguments?.target_dir).trim()
|
|
2425
|
+
: request.params.arguments?.output_dir
|
|
2426
|
+
? String(request.params.arguments?.output_dir).trim()
|
|
2427
|
+
: "";
|
|
2428
|
+
const targetDir = path.resolve(requestedTargetDir || process.cwd());
|
|
2429
|
+
const requestedFormat = String(request.params.arguments?.format || "feather");
|
|
2430
|
+
const fastMode = request.params.arguments?.fast === true;
|
|
2431
|
+
const preview = request.params.arguments?.preview === true;
|
|
2432
|
+
const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
|
|
2433
|
+
const columns = request.params.arguments?.columns;
|
|
2434
|
+
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
2435
|
+
// Use Metadata or Registry to find the actual local file
|
|
2436
|
+
const preferredLookupDirs = [targetDir, process.cwd()];
|
|
2437
|
+
let sourcePath = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
|
|
2438
|
+
if (!sourcePath) {
|
|
2439
|
+
console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
|
|
2440
|
+
// Start a prepare job for this dataset id (acts like calling prepare_dataset)
|
|
2441
|
+
try {
|
|
2442
|
+
jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
|
|
2443
|
+
}
|
|
2444
|
+
catch (e) {
|
|
2445
|
+
console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
|
|
2446
|
+
}
|
|
2447
|
+
// Poll for download status or registry entry until local_path appears or timeout
|
|
2448
|
+
const wait = (ms) => new Promise(res => setTimeout(res, ms));
|
|
2449
|
+
const maxWait = 120_000; // 120s
|
|
2450
|
+
const interval = 2000;
|
|
2451
|
+
let waited = 0;
|
|
2452
|
+
while (waited < maxWait) {
|
|
2453
|
+
const resolved = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
|
|
2454
|
+
if (resolved) {
|
|
2455
|
+
sourcePath = resolved;
|
|
2456
|
+
console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
|
|
2457
|
+
break;
|
|
2458
|
+
}
|
|
2459
|
+
await wait(interval);
|
|
2460
|
+
waited += interval;
|
|
2461
|
+
}
|
|
2462
|
+
// If still no sourcePath, return helpful error listing prepared datasets
|
|
2463
|
+
if (!sourcePath) {
|
|
2464
|
+
const entries = readRegistry();
|
|
2465
|
+
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
|
|
2466
|
+
return {
|
|
2467
|
+
content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
|
|
2468
|
+
isError: true
|
|
2469
|
+
};
|
|
2470
|
+
}
|
|
2471
|
+
}
|
|
2472
|
+
sourcePath = ensureExportableLocalPath(sourcePath);
|
|
2473
|
+
try {
|
|
2474
|
+
if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
|
|
2475
|
+
upsertRegistry(datasetId, sourcePath, "completed");
|
|
2476
|
+
}
|
|
2477
|
+
}
|
|
2478
|
+
catch (e) {
|
|
2479
|
+
console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
|
|
2480
|
+
}
|
|
2481
|
+
// If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
|
|
2482
|
+
if (!fastMode) {
|
|
2483
|
+
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
2484
|
+
const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
|
|
2485
|
+
const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
|
|
2486
|
+
if (!pipelineCompatibleInput) {
|
|
2487
|
+
console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
|
|
2488
|
+
}
|
|
2489
|
+
else if (currentExt !== pipelineFmt) {
|
|
2490
|
+
console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
|
|
2491
|
+
try {
|
|
2492
|
+
sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, targetDir);
|
|
2493
|
+
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
2494
|
+
if (pipelineResult.final_output_path) {
|
|
2495
|
+
sourcePath = pipelineResult.final_output_path;
|
|
2496
|
+
try {
|
|
2497
|
+
// Update registry to point to pipeline's final output
|
|
2498
|
+
if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
|
|
2499
|
+
upsertRegistry(datasetId, sourcePath, "completed");
|
|
2500
|
+
}
|
|
2501
|
+
}
|
|
2502
|
+
catch (e) {
|
|
2503
|
+
console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
|
|
2504
|
+
}
|
|
2505
|
+
}
|
|
2506
|
+
}
|
|
2507
|
+
catch (err) {
|
|
2508
|
+
console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
|
|
2509
|
+
}
|
|
2510
|
+
}
|
|
1481
2511
|
}
|
|
1482
2512
|
else {
|
|
1483
|
-
|
|
2513
|
+
console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
|
|
2514
|
+
}
|
|
2515
|
+
// Build export options
|
|
2516
|
+
const exportOpts = {};
|
|
2517
|
+
if (compression)
|
|
2518
|
+
exportOpts.compression = compression;
|
|
2519
|
+
if (preview)
|
|
2520
|
+
exportOpts.preview = true;
|
|
2521
|
+
if (sampleRows)
|
|
2522
|
+
exportOpts.sample_rows = sampleRows;
|
|
2523
|
+
if (columns)
|
|
2524
|
+
exportOpts.columns = columns;
|
|
2525
|
+
try {
|
|
2526
|
+
// Determine output file name
|
|
2527
|
+
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
2528
|
+
const ext = extMap[requestedFormat] || ".feather";
|
|
2529
|
+
const safeName = getExportFileStem(datasetId);
|
|
2530
|
+
const outDir = targetDir;
|
|
2531
|
+
if (!fs.existsSync(outDir))
|
|
2532
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
2533
|
+
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
2534
|
+
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
2535
|
+
// Build rich response
|
|
2536
|
+
let msg = `**Export complete**\n`;
|
|
2537
|
+
msg += `- **File**: ${result.output_path}\n`;
|
|
2538
|
+
msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
|
|
2539
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
2540
|
+
if (result.file_size_mb !== undefined)
|
|
2541
|
+
msg += `- **Size**: ${result.file_size_mb} MB\n`;
|
|
2542
|
+
if (result.elapsed_seconds !== undefined)
|
|
2543
|
+
msg += `- **Time**: ${result.elapsed_seconds}s\n`;
|
|
2544
|
+
if (result.preview_path)
|
|
2545
|
+
msg += `- **Preview**: ${result.preview_path}\n`;
|
|
2546
|
+
msg += `\n`;
|
|
2547
|
+
if (requestedFormat === "feather") {
|
|
2548
|
+
msg += `**Inspect with:**\n`;
|
|
2549
|
+
msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
|
|
2550
|
+
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
2551
|
+
}
|
|
2552
|
+
else if (requestedFormat === "parquet") {
|
|
2553
|
+
msg += `**Inspect with:**\n`;
|
|
2554
|
+
msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
|
|
2555
|
+
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
2556
|
+
}
|
|
2557
|
+
return { content: [{ type: "text", text: msg }] };
|
|
2558
|
+
}
|
|
2559
|
+
catch (error) {
|
|
2560
|
+
return {
|
|
2561
|
+
content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
|
|
2562
|
+
isError: true
|
|
2563
|
+
};
|
|
1484
2564
|
}
|
|
1485
2565
|
}
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
// I'll make a global `targetDetector` constant in index.ts
|
|
1499
|
-
// But wait, I updated `CleaningPlanner` to instantiate its own detector.
|
|
1500
|
-
// Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
|
|
1501
|
-
// RETRY STRATEGY:
|
|
1502
|
-
// 1. Instantiate `targetDetector` in `index.ts`.
|
|
1503
|
-
// 2. Run `detectTarget(filePath)`.
|
|
1504
|
-
// 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
|
|
1505
|
-
// I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
|
|
1506
|
-
// But since I'm in this tool, I can't look back.
|
|
1507
|
-
// I will assume I can add it, or just do it inside the case for now.
|
|
1508
|
-
// To do it properly, I should have added `targetDetector` to the global scope in previous step.
|
|
1509
|
-
// Let's do that in a separate step if needed.
|
|
1510
|
-
// For now, I'll instantiate it here.
|
|
1511
|
-
const { TargetDetector } = await import("./preparation/target-detector.js");
|
|
1512
|
-
const detector = new TargetDetector(__dirname);
|
|
1513
|
-
const targetResult = await detector.detectTarget(filePath);
|
|
1514
|
-
const targetInfo = targetResult.target_column ? {
|
|
1515
|
-
target: targetResult.target_column,
|
|
1516
|
-
confidence: targetResult.confidence
|
|
1517
|
-
} : undefined;
|
|
1518
|
-
const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
|
|
1519
|
-
let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
|
|
1520
|
-
if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
|
|
1521
|
-
explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
|
|
1522
|
-
explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
|
|
1523
|
-
}
|
|
1524
|
-
explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
|
|
1525
|
-
if (plan.operations.length === 0) {
|
|
1526
|
-
explanation += "No cleaning operations required.";
|
|
1527
|
-
}
|
|
1528
|
-
else {
|
|
1529
|
-
plan.operations.forEach((op, i) => {
|
|
1530
|
-
explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
|
|
2566
|
+
case "vesper_list_datasets": {
|
|
2567
|
+
const entries = readRegistry();
|
|
2568
|
+
if (entries.length === 0) {
|
|
2569
|
+
return {
|
|
2570
|
+
content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
|
|
2571
|
+
};
|
|
2572
|
+
}
|
|
2573
|
+
const lines = entries.map((e, i) => {
|
|
2574
|
+
const id = e.dataset_id || e.id || "unknown";
|
|
2575
|
+
const localPath = e.local_path || e.path || "unknown";
|
|
2576
|
+
const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
|
|
2577
|
+
return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
|
|
1531
2578
|
});
|
|
1532
|
-
}
|
|
1533
|
-
return {
|
|
1534
|
-
content: [{ type: "text", text: explanation }]
|
|
1535
|
-
};
|
|
1536
|
-
}
|
|
1537
|
-
case "custom_clean": {
|
|
1538
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1539
|
-
const ops = request.params.arguments?.operations;
|
|
1540
|
-
if (!datasetId || datasetId === "undefined") {
|
|
1541
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1542
|
-
}
|
|
1543
|
-
if (!ops || !Array.isArray(ops) || ops.length === 0) {
|
|
1544
|
-
throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
|
|
1545
|
-
}
|
|
1546
|
-
// Pre-check: verify dataset file exists before starting the job
|
|
1547
|
-
const cleanRegEntry = getRegistryEntry(datasetId);
|
|
1548
|
-
const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
|
|
1549
|
-
const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
|
|
1550
|
-
const cleanSafeId = datasetId.replace(/\//g, "_");
|
|
1551
|
-
const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
|
|
1552
|
-
(cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
|
|
1553
|
-
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
|
|
1554
|
-
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
|
|
1555
|
-
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
|
|
1556
|
-
fs.existsSync(datasetId);
|
|
1557
|
-
if (!cleanDataExists) {
|
|
1558
|
-
return {
|
|
1559
|
-
content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
|
|
1560
|
-
isError: true,
|
|
1561
|
-
};
|
|
1562
|
-
}
|
|
1563
|
-
const job = jobManager.createJob("clean", 0, { datasetId, ops });
|
|
1564
|
-
return {
|
|
1565
|
-
content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
|
|
1566
|
-
};
|
|
1567
|
-
}
|
|
1568
|
-
case "prepare_dataset": {
|
|
1569
|
-
hydrateExternalKeys();
|
|
1570
|
-
const query = String(request.params.arguments?.query);
|
|
1571
|
-
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
1572
|
-
const downloadImages = request.params.arguments?.download_images === true;
|
|
1573
|
-
if (!query || query === "undefined") {
|
|
1574
|
-
throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
|
|
1575
|
-
}
|
|
1576
|
-
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
|
|
1577
|
-
return {
|
|
1578
|
-
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
|
|
1579
|
-
};
|
|
1580
|
-
}
|
|
1581
|
-
case "compare_datasets": {
|
|
1582
|
-
const datasetIds = request.params.arguments?.dataset_ids;
|
|
1583
|
-
const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
|
|
1584
|
-
let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
|
|
1585
|
-
comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
|
|
1586
|
-
comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
|
|
1587
|
-
comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
|
|
1588
|
-
comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
|
|
1589
|
-
comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
|
|
1590
|
-
return {
|
|
1591
|
-
content: [{ type: "text", text: comparison }]
|
|
1592
|
-
};
|
|
1593
|
-
}
|
|
1594
|
-
case "check_job_status": {
|
|
1595
|
-
const jobId = String(request.params.arguments?.job_id);
|
|
1596
|
-
const job = metadataStore.getJob(jobId);
|
|
1597
|
-
if (!job) {
|
|
1598
|
-
throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
|
|
1599
|
-
}
|
|
1600
|
-
const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
|
|
1601
|
-
const now = Date.now();
|
|
1602
|
-
const last = jobStatusLastPoll[jobId] || 0;
|
|
1603
|
-
const minPollMs = 3000;
|
|
1604
|
-
if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
|
|
1605
|
-
const waitMs = minPollMs - (now - last);
|
|
1606
2579
|
return {
|
|
1607
|
-
content: [{ type: "text", text:
|
|
2580
|
+
content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
|
|
1608
2581
|
};
|
|
1609
2582
|
}
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
case "export_dataset": {
|
|
1616
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1617
|
-
const targetDir = request.params.arguments?.target_dir ? String(request.params.arguments?.target_dir) : undefined;
|
|
1618
|
-
const requestedFormat = String(request.params.arguments?.format || "feather");
|
|
1619
|
-
const fastMode = request.params.arguments?.fast === true;
|
|
1620
|
-
const preview = request.params.arguments?.preview === true;
|
|
1621
|
-
const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
|
|
1622
|
-
const columns = request.params.arguments?.columns;
|
|
1623
|
-
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
1624
|
-
const dataset = metadataStore.getDataset(datasetId);
|
|
1625
|
-
if (!dataset) {
|
|
1626
|
-
throw new McpError(ErrorCode.InvalidParams, `Dataset not found: ${datasetId}`);
|
|
1627
|
-
}
|
|
1628
|
-
// Use Metadata or Registry to find the actual local file
|
|
1629
|
-
let sourcePath = undefined;
|
|
1630
|
-
const downloadStatus = metadataStore.getDownloadStatus(datasetId);
|
|
1631
|
-
if (downloadStatus && fs.existsSync(downloadStatus.local_path)) {
|
|
1632
|
-
sourcePath = downloadStatus.local_path;
|
|
1633
|
-
}
|
|
1634
|
-
else {
|
|
1635
|
-
// Fallback to local registry
|
|
1636
|
-
const reg = getRegistryEntry(datasetId);
|
|
1637
|
-
if (reg && fs.existsSync(reg.local_path)) {
|
|
1638
|
-
sourcePath = reg.local_path;
|
|
2583
|
+
case "vesper_convert_format": {
|
|
2584
|
+
const filePath = String(request.params.arguments?.file_path || "").trim();
|
|
2585
|
+
const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
|
|
2586
|
+
if (!filePath) {
|
|
2587
|
+
throw new McpError(ErrorCode.InvalidParams, "file_path is required");
|
|
1639
2588
|
}
|
|
1640
|
-
|
|
1641
|
-
|
|
1642
|
-
|
|
1643
|
-
|
|
2589
|
+
if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
|
|
2590
|
+
throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
|
|
2591
|
+
}
|
|
2592
|
+
if (!fs.existsSync(filePath)) {
|
|
2593
|
+
return {
|
|
2594
|
+
content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
|
|
2595
|
+
isError: true,
|
|
2596
|
+
};
|
|
2597
|
+
}
|
|
2598
|
+
const inputExt = path.extname(filePath).toLowerCase();
|
|
2599
|
+
const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
|
|
2600
|
+
const outputExt = extMap[targetFormat];
|
|
2601
|
+
if (inputExt === outputExt) {
|
|
2602
|
+
return {
|
|
2603
|
+
content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
|
|
2604
|
+
};
|
|
2605
|
+
}
|
|
2606
|
+
const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
|
|
1644
2607
|
try {
|
|
1645
|
-
|
|
2608
|
+
await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
|
|
2609
|
+
const convertScript = path.join(dataRoot, "python", "convert_engine.py");
|
|
2610
|
+
const result = await runPythonJson(convertScript, [filePath, outputPath]);
|
|
2611
|
+
if (!result.ok) {
|
|
2612
|
+
return {
|
|
2613
|
+
content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
|
|
2614
|
+
isError: true,
|
|
2615
|
+
};
|
|
2616
|
+
}
|
|
2617
|
+
// Register converted file in the registry
|
|
2618
|
+
const datasetId = path.basename(outputPath, outputExt);
|
|
2619
|
+
try {
|
|
2620
|
+
upsertRegistry(datasetId, outputPath, "completed");
|
|
2621
|
+
}
|
|
2622
|
+
catch (e) {
|
|
2623
|
+
console.error(`[Convert] Registry write failed: ${e?.message || e}`);
|
|
2624
|
+
}
|
|
2625
|
+
let msg = `**Conversion complete**\n`;
|
|
2626
|
+
msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
|
|
2627
|
+
msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
|
|
2628
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
2629
|
+
if (result.size_mb !== undefined)
|
|
2630
|
+
msg += `- **Size**: ${result.size_mb} MB\n`;
|
|
2631
|
+
return { content: [{ type: "text", text: msg }] };
|
|
1646
2632
|
}
|
|
1647
|
-
catch (
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
1653
|
-
|
|
1654
|
-
|
|
1655
|
-
|
|
1656
|
-
|
|
1657
|
-
|
|
1658
|
-
|
|
1659
|
-
|
|
1660
|
-
|
|
2633
|
+
catch (error) {
|
|
2634
|
+
return {
|
|
2635
|
+
content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
|
|
2636
|
+
isError: true,
|
|
2637
|
+
};
|
|
2638
|
+
}
|
|
2639
|
+
}
|
|
2640
|
+
case "fuse_datasets": {
|
|
2641
|
+
const rawSources = request.params.arguments?.sources;
|
|
2642
|
+
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
2643
|
+
throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
|
|
2644
|
+
}
|
|
2645
|
+
const strategy = request.params.arguments?.strategy || "concat";
|
|
2646
|
+
const joinOn = request.params.arguments?.join_on;
|
|
2647
|
+
const how = request.params.arguments?.how || "inner";
|
|
2648
|
+
const dedup = request.params.arguments?.dedup !== false;
|
|
2649
|
+
const runQualityAfter = request.params.arguments?.run_quality_after !== false;
|
|
2650
|
+
const leakageCheck = request.params.arguments?.leakage_check !== false;
|
|
2651
|
+
const outputFormat = request.params.arguments?.output_format || "feather";
|
|
2652
|
+
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
2653
|
+
const preview = request.params.arguments?.preview !== false;
|
|
2654
|
+
const resolvedPaths = [];
|
|
2655
|
+
const unresolved = [];
|
|
2656
|
+
for (const src of rawSources) {
|
|
2657
|
+
if (fs.existsSync(src)) {
|
|
2658
|
+
resolvedPaths.push(src);
|
|
2659
|
+
continue;
|
|
1661
2660
|
}
|
|
1662
|
-
const
|
|
1663
|
-
|
|
1664
|
-
|
|
1665
|
-
|
|
1666
|
-
console.error(`[Export] Local data found in registry for ${datasetId}: ${sourcePath}`);
|
|
1667
|
-
break;
|
|
2661
|
+
const status = metadataStore.getDownloadStatus(src);
|
|
2662
|
+
if (status?.local_path && fs.existsSync(status.local_path)) {
|
|
2663
|
+
resolvedPaths.push(status.local_path);
|
|
2664
|
+
continue;
|
|
1668
2665
|
}
|
|
1669
|
-
|
|
1670
|
-
waited += interval;
|
|
2666
|
+
unresolved.push(src);
|
|
1671
2667
|
}
|
|
1672
|
-
|
|
1673
|
-
if (!sourcePath) {
|
|
1674
|
-
const entries = readRegistry();
|
|
1675
|
-
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
|
|
2668
|
+
if (unresolved.length > 0) {
|
|
1676
2669
|
return {
|
|
1677
|
-
content: [{
|
|
2670
|
+
content: [{
|
|
2671
|
+
type: "text",
|
|
2672
|
+
text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
|
|
2673
|
+
}],
|
|
1678
2674
|
isError: true
|
|
1679
2675
|
};
|
|
1680
2676
|
}
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
console.error(`[
|
|
1689
|
-
|
|
1690
|
-
|
|
1691
|
-
|
|
2677
|
+
try {
|
|
2678
|
+
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
|
|
2679
|
+
const ext = extMap[outputFormat] || ".feather";
|
|
2680
|
+
const outDir = process.cwd();
|
|
2681
|
+
if (!fs.existsSync(outDir))
|
|
2682
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
2683
|
+
const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
|
|
2684
|
+
console.error(`[Fusion] Resolved output directory: ${outDir}`);
|
|
2685
|
+
const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
|
|
2686
|
+
strategy,
|
|
2687
|
+
join_on: joinOn,
|
|
2688
|
+
how,
|
|
2689
|
+
dedup,
|
|
2690
|
+
run_quality_after: runQualityAfter,
|
|
2691
|
+
leakage_check: leakageCheck,
|
|
2692
|
+
output_format: outputFormat,
|
|
2693
|
+
compression: compression,
|
|
2694
|
+
preview,
|
|
2695
|
+
});
|
|
2696
|
+
const nullDelta = result.stats.null_delta;
|
|
2697
|
+
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
2698
|
+
// Register fused dataset under a generated id so users can export it easily
|
|
2699
|
+
const fusedId = `fused_${Date.now()}`;
|
|
1692
2700
|
try {
|
|
1693
|
-
|
|
1694
|
-
if (pipelineResult.final_output_path) {
|
|
1695
|
-
sourcePath = pipelineResult.final_output_path;
|
|
1696
|
-
try {
|
|
1697
|
-
// Update registry to point to pipeline's final output
|
|
1698
|
-
upsertRegistry(datasetId, sourcePath, "completed");
|
|
1699
|
-
}
|
|
1700
|
-
catch (e) {
|
|
1701
|
-
console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
|
|
1702
|
-
}
|
|
1703
|
-
}
|
|
2701
|
+
upsertRegistry(fusedId, result.output_path, "completed");
|
|
1704
2702
|
}
|
|
1705
|
-
catch (
|
|
1706
|
-
console.error(`[
|
|
2703
|
+
catch (e) {
|
|
2704
|
+
console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
|
|
1707
2705
|
}
|
|
2706
|
+
let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
|
|
2707
|
+
msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
|
|
2708
|
+
msg += `- Null change: ${nullText}\n`;
|
|
2709
|
+
msg += `- Output: ${result.output_path}\n`;
|
|
2710
|
+
if (result.preview_path)
|
|
2711
|
+
msg += `- Preview: ${result.preview_path}\n`;
|
|
2712
|
+
if (result.leakage_report) {
|
|
2713
|
+
msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
|
|
2714
|
+
if (result.leakage_report.leakage_count) {
|
|
2715
|
+
msg += ` (${result.leakage_report.leakage_count})`;
|
|
2716
|
+
}
|
|
2717
|
+
msg += "\n";
|
|
2718
|
+
}
|
|
2719
|
+
msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
|
|
2720
|
+
return { content: [{ type: "text", text: msg }] };
|
|
1708
2721
|
}
|
|
1709
|
-
|
|
1710
|
-
|
|
1711
|
-
|
|
1712
|
-
|
|
1713
|
-
|
|
1714
|
-
const exportOpts = {};
|
|
1715
|
-
if (compression)
|
|
1716
|
-
exportOpts.compression = compression;
|
|
1717
|
-
if (preview)
|
|
1718
|
-
exportOpts.preview = true;
|
|
1719
|
-
if (sampleRows)
|
|
1720
|
-
exportOpts.sample_rows = sampleRows;
|
|
1721
|
-
if (columns)
|
|
1722
|
-
exportOpts.columns = columns;
|
|
1723
|
-
try {
|
|
1724
|
-
// Determine output file name
|
|
1725
|
-
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
1726
|
-
const ext = extMap[requestedFormat] || ".feather";
|
|
1727
|
-
const safeName = datasetId.replace(/\//g, "_");
|
|
1728
|
-
const outDir = targetDir || path.join(dataRoot, "exports");
|
|
1729
|
-
if (!fs.existsSync(outDir))
|
|
1730
|
-
fs.mkdirSync(outDir, { recursive: true });
|
|
1731
|
-
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
1732
|
-
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
1733
|
-
// Build rich response
|
|
1734
|
-
let msg = `**Export complete**\n`;
|
|
1735
|
-
msg += `- **File**: ${result.output_path}\n`;
|
|
1736
|
-
msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
|
|
1737
|
-
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
1738
|
-
if (result.file_size_mb !== undefined)
|
|
1739
|
-
msg += `- **Size**: ${result.file_size_mb} MB\n`;
|
|
1740
|
-
if (result.elapsed_seconds !== undefined)
|
|
1741
|
-
msg += `- **Time**: ${result.elapsed_seconds}s\n`;
|
|
1742
|
-
if (result.preview_path)
|
|
1743
|
-
msg += `- **Preview**: ${result.preview_path}\n`;
|
|
1744
|
-
msg += `\n`;
|
|
1745
|
-
if (requestedFormat === "feather") {
|
|
1746
|
-
msg += `**Inspect with:**\n`;
|
|
1747
|
-
msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
|
|
1748
|
-
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
1749
|
-
}
|
|
1750
|
-
else if (requestedFormat === "parquet") {
|
|
1751
|
-
msg += `**Inspect with:**\n`;
|
|
1752
|
-
msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
|
|
1753
|
-
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
1754
|
-
}
|
|
1755
|
-
return { content: [{ type: "text", text: msg }] };
|
|
1756
|
-
}
|
|
1757
|
-
catch (error) {
|
|
1758
|
-
return {
|
|
1759
|
-
content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
|
|
1760
|
-
isError: true
|
|
1761
|
-
};
|
|
1762
|
-
}
|
|
1763
|
-
}
|
|
1764
|
-
case "fuse_datasets": {
|
|
1765
|
-
const rawSources = request.params.arguments?.sources;
|
|
1766
|
-
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
1767
|
-
throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
|
|
1768
|
-
}
|
|
1769
|
-
const strategy = request.params.arguments?.strategy || "concat";
|
|
1770
|
-
const joinOn = request.params.arguments?.join_on;
|
|
1771
|
-
const how = request.params.arguments?.how || "inner";
|
|
1772
|
-
const dedup = request.params.arguments?.dedup !== false;
|
|
1773
|
-
const runQualityAfter = request.params.arguments?.run_quality_after !== false;
|
|
1774
|
-
const leakageCheck = request.params.arguments?.leakage_check !== false;
|
|
1775
|
-
const outputFormat = request.params.arguments?.output_format || "feather";
|
|
1776
|
-
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
1777
|
-
const preview = request.params.arguments?.preview !== false;
|
|
1778
|
-
const resolvedPaths = [];
|
|
1779
|
-
const unresolved = [];
|
|
1780
|
-
for (const src of rawSources) {
|
|
1781
|
-
if (fs.existsSync(src)) {
|
|
1782
|
-
resolvedPaths.push(src);
|
|
1783
|
-
continue;
|
|
1784
|
-
}
|
|
1785
|
-
const status = metadataStore.getDownloadStatus(src);
|
|
1786
|
-
if (status?.local_path && fs.existsSync(status.local_path)) {
|
|
1787
|
-
resolvedPaths.push(status.local_path);
|
|
1788
|
-
continue;
|
|
2722
|
+
catch (error) {
|
|
2723
|
+
return {
|
|
2724
|
+
content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
|
|
2725
|
+
isError: true
|
|
2726
|
+
};
|
|
1789
2727
|
}
|
|
1790
|
-
unresolved.push(src);
|
|
1791
|
-
}
|
|
1792
|
-
if (unresolved.length > 0) {
|
|
1793
|
-
return {
|
|
1794
|
-
content: [{
|
|
1795
|
-
type: "text",
|
|
1796
|
-
text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
|
|
1797
|
-
}],
|
|
1798
|
-
isError: true
|
|
1799
|
-
};
|
|
1800
2728
|
}
|
|
1801
|
-
|
|
1802
|
-
const
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
if (!fs.existsSync(outDir))
|
|
1806
|
-
fs.mkdirSync(outDir, { recursive: true });
|
|
1807
|
-
const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
|
|
1808
|
-
const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
|
|
1809
|
-
strategy,
|
|
1810
|
-
join_on: joinOn,
|
|
1811
|
-
how,
|
|
1812
|
-
dedup,
|
|
1813
|
-
run_quality_after: runQualityAfter,
|
|
1814
|
-
leakage_check: leakageCheck,
|
|
1815
|
-
output_format: outputFormat,
|
|
1816
|
-
compression: compression,
|
|
1817
|
-
preview,
|
|
1818
|
-
});
|
|
1819
|
-
const nullDelta = result.stats.null_delta;
|
|
1820
|
-
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
1821
|
-
// Register fused dataset under a generated id so users can export it easily
|
|
1822
|
-
const fusedId = `fused_${Date.now()}`;
|
|
1823
|
-
try {
|
|
1824
|
-
upsertRegistry(fusedId, result.output_path, "completed");
|
|
2729
|
+
case "analyze_image_quality": {
|
|
2730
|
+
const inputPath = String(request.params.arguments?.path);
|
|
2731
|
+
if (!fs.existsSync(inputPath)) {
|
|
2732
|
+
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
1825
2733
|
}
|
|
1826
|
-
|
|
1827
|
-
|
|
1828
|
-
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
1833
|
-
|
|
1834
|
-
|
|
1835
|
-
|
|
1836
|
-
|
|
1837
|
-
|
|
1838
|
-
|
|
2734
|
+
try {
|
|
2735
|
+
const report = await imageAnalyzer.analyze(inputPath);
|
|
2736
|
+
let output = `## Image Quality Report\n\n`;
|
|
2737
|
+
output += `- **Total Images**: ${report.total_images}\n`;
|
|
2738
|
+
output += `- **Corrupted**: ${report.corrupted_count}\n`;
|
|
2739
|
+
output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
|
|
2740
|
+
output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
|
|
2741
|
+
if (report.individual_results.length > 0) {
|
|
2742
|
+
output += `### Sample Detail (Top 5)\n`;
|
|
2743
|
+
report.individual_results.slice(0, 5).forEach(img => {
|
|
2744
|
+
const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
|
|
2745
|
+
output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
|
|
2746
|
+
});
|
|
1839
2747
|
}
|
|
1840
|
-
|
|
2748
|
+
return {
|
|
2749
|
+
content: [{ type: "text", text: output }]
|
|
2750
|
+
};
|
|
2751
|
+
}
|
|
2752
|
+
catch (error) {
|
|
2753
|
+
return {
|
|
2754
|
+
content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
|
|
2755
|
+
isError: true
|
|
2756
|
+
};
|
|
1841
2757
|
}
|
|
1842
|
-
msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
|
|
1843
|
-
return { content: [{ type: "text", text: msg }] };
|
|
1844
|
-
}
|
|
1845
|
-
catch (error) {
|
|
1846
|
-
return {
|
|
1847
|
-
content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
|
|
1848
|
-
isError: true
|
|
1849
|
-
};
|
|
1850
|
-
}
|
|
1851
|
-
}
|
|
1852
|
-
case "analyze_image_quality": {
|
|
1853
|
-
const inputPath = String(request.params.arguments?.path);
|
|
1854
|
-
if (!fs.existsSync(inputPath)) {
|
|
1855
|
-
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
1856
2758
|
}
|
|
1857
|
-
|
|
1858
|
-
const
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
output +=
|
|
1866
|
-
|
|
1867
|
-
|
|
1868
|
-
|
|
2759
|
+
case "analyze_media_quality": {
|
|
2760
|
+
const inputPath = String(request.params.arguments?.path);
|
|
2761
|
+
if (!fs.existsSync(inputPath)) {
|
|
2762
|
+
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
2763
|
+
}
|
|
2764
|
+
try {
|
|
2765
|
+
const report = await mediaAnalyzer.analyze(inputPath);
|
|
2766
|
+
let output = `## Media Quality Report\n\n`;
|
|
2767
|
+
output += `- **Total Files**: ${report.total_files}\n`;
|
|
2768
|
+
output += `- **OK Files**: ${report.ok_files}\n`;
|
|
2769
|
+
output += `- **Failed Files**: ${report.failed_files}\n`;
|
|
2770
|
+
if ('avg_audio_duration' in report && report.avg_audio_duration) {
|
|
2771
|
+
output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
|
|
2772
|
+
}
|
|
2773
|
+
if ('avg_video_duration' in report && report.avg_video_duration) {
|
|
2774
|
+
output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
|
|
2775
|
+
output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
|
|
2776
|
+
}
|
|
2777
|
+
output += `\n### Sample Detail (Top 5)\n`;
|
|
2778
|
+
report.details.slice(0, 5).forEach(item => {
|
|
2779
|
+
const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
|
|
2780
|
+
if (item.type === "audio" && 'sample_rate' in item) {
|
|
2781
|
+
output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
|
|
2782
|
+
}
|
|
2783
|
+
else if (item.type === "video" && 'width' in item) {
|
|
2784
|
+
output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
|
|
2785
|
+
}
|
|
2786
|
+
else {
|
|
2787
|
+
output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
|
|
2788
|
+
}
|
|
1869
2789
|
});
|
|
2790
|
+
return {
|
|
2791
|
+
content: [{ type: "text", text: output }]
|
|
2792
|
+
};
|
|
2793
|
+
}
|
|
2794
|
+
catch (error) {
|
|
2795
|
+
return {
|
|
2796
|
+
content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
|
|
2797
|
+
isError: true
|
|
2798
|
+
};
|
|
1870
2799
|
}
|
|
1871
|
-
return {
|
|
1872
|
-
content: [{ type: "text", text: output }]
|
|
1873
|
-
};
|
|
1874
|
-
}
|
|
1875
|
-
catch (error) {
|
|
1876
|
-
return {
|
|
1877
|
-
content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
|
|
1878
|
-
isError: true
|
|
1879
|
-
};
|
|
1880
|
-
}
|
|
1881
|
-
}
|
|
1882
|
-
case "analyze_media_quality": {
|
|
1883
|
-
const inputPath = String(request.params.arguments?.path);
|
|
1884
|
-
if (!fs.existsSync(inputPath)) {
|
|
1885
|
-
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
1886
2800
|
}
|
|
1887
|
-
|
|
1888
|
-
const
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
|
|
1892
|
-
|
|
1893
|
-
|
|
1894
|
-
|
|
1895
|
-
|
|
1896
|
-
|
|
1897
|
-
|
|
1898
|
-
|
|
1899
|
-
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
|
|
1903
|
-
if (item.type === "audio" && 'sample_rate' in item) {
|
|
1904
|
-
output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
|
|
2801
|
+
case "generate_quality_report": {
|
|
2802
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2803
|
+
const datasetPath = String(request.params.arguments?.dataset_path);
|
|
2804
|
+
if (!fs.existsSync(datasetPath)) {
|
|
2805
|
+
throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
|
|
2806
|
+
}
|
|
2807
|
+
try {
|
|
2808
|
+
// Optionally load text quality from metadata if available
|
|
2809
|
+
const metadata = await metadataStore.getDataset(datasetId);
|
|
2810
|
+
// TODO: Integrate text quality analysis when available
|
|
2811
|
+
const textQuality = null;
|
|
2812
|
+
const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
|
|
2813
|
+
// Save report to metadata
|
|
2814
|
+
if (metadata) {
|
|
2815
|
+
metadata.unified_quality_report = report;
|
|
2816
|
+
await metadataStore.saveDataset(metadata);
|
|
1905
2817
|
}
|
|
1906
|
-
|
|
1907
|
-
|
|
2818
|
+
let output = `# Unified Quality Report\n\n`;
|
|
2819
|
+
output += `**Dataset**: ${datasetId}\n`;
|
|
2820
|
+
output += `**Modalities**: ${report.modalities.join(", ")}\n`;
|
|
2821
|
+
output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
|
|
2822
|
+
if (report.text_quality) {
|
|
2823
|
+
output += `## Text Quality\n`;
|
|
2824
|
+
output += `- Rows: ${report.text_quality.row_count}\n`;
|
|
2825
|
+
output += `- Columns: ${report.text_quality.column_count}\n`;
|
|
2826
|
+
output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
|
|
2827
|
+
output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
|
|
1908
2828
|
}
|
|
1909
|
-
|
|
1910
|
-
output +=
|
|
2829
|
+
if (report.image_quality) {
|
|
2830
|
+
output += `## Image Quality\n`;
|
|
2831
|
+
output += `- Total Images: ${report.image_quality.total_images}\n`;
|
|
2832
|
+
output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
|
|
2833
|
+
output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
|
|
2834
|
+
output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
|
|
1911
2835
|
}
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
|
|
1915
|
-
|
|
1916
|
-
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
|
|
1920
|
-
|
|
1921
|
-
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
1926
|
-
|
|
1927
|
-
|
|
1928
|
-
|
|
1929
|
-
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
|
|
1934
|
-
|
|
1935
|
-
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
}
|
|
1941
|
-
let output = `# Unified Quality Report\n\n`;
|
|
1942
|
-
output += `**Dataset**: ${datasetId}\n`;
|
|
1943
|
-
output += `**Modalities**: ${report.modalities.join(", ")}\n`;
|
|
1944
|
-
output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
|
|
1945
|
-
if (report.text_quality) {
|
|
1946
|
-
output += `## Text Quality\n`;
|
|
1947
|
-
output += `- Rows: ${report.text_quality.row_count}\n`;
|
|
1948
|
-
output += `- Columns: ${report.text_quality.column_count}\n`;
|
|
1949
|
-
output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
|
|
1950
|
-
output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
|
|
1951
|
-
}
|
|
1952
|
-
if (report.image_quality) {
|
|
1953
|
-
output += `## Image Quality\n`;
|
|
1954
|
-
output += `- Total Images: ${report.image_quality.total_images}\n`;
|
|
1955
|
-
output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
|
|
1956
|
-
output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
|
|
1957
|
-
output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
|
|
1958
|
-
}
|
|
1959
|
-
if (report.audio_quality) {
|
|
1960
|
-
output += `## Audio Quality\n`;
|
|
1961
|
-
output += `- Total Files: ${report.audio_quality.total_files}\n`;
|
|
1962
|
-
output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
|
|
1963
|
-
output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
|
|
1964
|
-
output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
|
|
1965
|
-
}
|
|
1966
|
-
if (report.video_quality) {
|
|
1967
|
-
output += `## Video Quality\n`;
|
|
1968
|
-
output += `- Total Files: ${report.video_quality.total_files}\n`;
|
|
1969
|
-
output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
|
|
1970
|
-
output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
|
|
1971
|
-
output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
|
|
1972
|
-
}
|
|
1973
|
-
output += `## Recommendations\n`;
|
|
1974
|
-
report.recommendations.forEach(rec => {
|
|
1975
|
-
output += `- ${rec}\n`;
|
|
1976
|
-
});
|
|
1977
|
-
return {
|
|
1978
|
-
content: [{ type: "text", text: output }]
|
|
1979
|
-
};
|
|
1980
|
-
}
|
|
1981
|
-
catch (error) {
|
|
1982
|
-
return {
|
|
1983
|
-
content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
|
|
1984
|
-
isError: true
|
|
1985
|
-
};
|
|
2836
|
+
if (report.audio_quality) {
|
|
2837
|
+
output += `## Audio Quality\n`;
|
|
2838
|
+
output += `- Total Files: ${report.audio_quality.total_files}\n`;
|
|
2839
|
+
output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
|
|
2840
|
+
output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
|
|
2841
|
+
output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
|
|
2842
|
+
}
|
|
2843
|
+
if (report.video_quality) {
|
|
2844
|
+
output += `## Video Quality\n`;
|
|
2845
|
+
output += `- Total Files: ${report.video_quality.total_files}\n`;
|
|
2846
|
+
output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
|
|
2847
|
+
output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
|
|
2848
|
+
output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
|
|
2849
|
+
}
|
|
2850
|
+
output += `## Recommendations\n`;
|
|
2851
|
+
report.recommendations.forEach(rec => {
|
|
2852
|
+
output += `- ${rec}\n`;
|
|
2853
|
+
});
|
|
2854
|
+
return {
|
|
2855
|
+
content: [{ type: "text", text: output }]
|
|
2856
|
+
};
|
|
2857
|
+
}
|
|
2858
|
+
catch (error) {
|
|
2859
|
+
return {
|
|
2860
|
+
content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
|
|
2861
|
+
isError: true
|
|
2862
|
+
};
|
|
2863
|
+
}
|
|
1986
2864
|
}
|
|
2865
|
+
default:
|
|
2866
|
+
throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
|
|
1987
2867
|
}
|
|
1988
|
-
|
|
1989
|
-
throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
|
|
1990
|
-
}
|
|
2868
|
+
}); // end requestQueue.enqueue
|
|
1991
2869
|
});
|
|
1992
2870
|
async function main() {
|
|
1993
2871
|
const args = process.argv.slice(2);
|
|
@@ -1995,6 +2873,7 @@ async function main() {
|
|
|
1995
2873
|
const isFuse = args.includes("fuse");
|
|
1996
2874
|
const isDiscover = args.includes("discover");
|
|
1997
2875
|
const isDownload = args.includes("download");
|
|
2876
|
+
const isExport = args.includes("export");
|
|
1998
2877
|
const isConfig = args.includes("config") || args.includes("configure");
|
|
1999
2878
|
const isSetup = args.includes("--setup") || args.includes("setup");
|
|
2000
2879
|
const isSilent = args.includes("--silent");
|
|
@@ -2017,6 +2896,10 @@ async function main() {
|
|
|
2017
2896
|
await runDownloadCli(args);
|
|
2018
2897
|
return;
|
|
2019
2898
|
}
|
|
2899
|
+
if (isExport) {
|
|
2900
|
+
await runExportCli(args);
|
|
2901
|
+
return;
|
|
2902
|
+
}
|
|
2020
2903
|
// If run in explicit setup mode, show setup wizard (do not auto-run on server startup)
|
|
2021
2904
|
if (isSetup) {
|
|
2022
2905
|
await runSetupWizard(isSilent);
|
|
@@ -2289,6 +3172,99 @@ async function runDownloadCli(args) {
|
|
|
2289
3172
|
}
|
|
2290
3173
|
console.log(`Download complete: ${localPath}`);
|
|
2291
3174
|
}
|
|
3175
|
+
/**
 * CLI entry point for `vespermcp export`.
 *
 * Resolves a dataset id (or direct local path) to a local file, optionally
 * runs it through the conversion pipeline, and writes the export into the
 * target directory. Progress and diagnostics go to stderr; the final summary
 * goes to stdout.
 *
 * Recognised options:
 *   --format <fmt>        output format (default "parquet", matched case-insensitively)
 *   --target-dir <dir>    output directory (default: current working directory)
 *   --compression <name>  forwarded to the exporter as `compression`
 *   --sample-rows <N>     forwarded to the exporter as `sample_rows`; must be a non-negative integer
 *   --columns <a,b,...>   comma-separated list forwarded to the exporter as `columns`
 *   --fast                skip the pipeline step and export the resolved file as-is
 *   --preview             forwarded to the exporter as `preview: true`
 *
 * Exits the process with code 1 when the dataset argument is missing, when
 * `--sample-rows` is not a non-negative integer, or when no local data can be
 * resolved for the dataset.
 *
 * @param {string[]} args - Raw CLI arguments; the dataset reference is the
 *   second positional (non-flag) argument, i.e. the one after the `export` command.
 * @returns {Promise<void>}
 */
async function runExportCli(args) {
    // Options that consume the following token as their value.
    const valueFlags = ["--target-dir", "--format", "--compression", "--sample-rows", "--columns"];
    const getArgValue = (name) => {
        const idx = args.indexOf(name);
        if (idx < 0 || idx + 1 >= args.length) {
            return undefined;
        }
        const value = args[idx + 1];
        // A following "--flag" token means the value was omitted (e.g. `--format --fast`);
        // do not mistake the next flag for this option's value.
        return value.startsWith("--") ? undefined : value;
    };
    // Positional arguments: everything that is neither a flag nor the value of a value-taking flag.
    const nonFlags = args.filter((arg, index) => {
        if (arg.startsWith("--")) {
            return false;
        }
        const previous = index > 0 ? args[index - 1] : "";
        return !valueFlags.includes(previous);
    });
    const datasetId = nonFlags[1] || "";
    if (!datasetId) {
        console.error("Usage: vespermcp export <dataset-id|local-path> [--format parquet|feather|csv|jsonl|arrow] [--target-dir C:/path] [--compression snappy] [--fast] [--preview] [--sample-rows N] [--columns col1,col2]");
        process.exit(1);
    }
    // Normalise case so `--format CSV` selects the same format and extension as `--format csv`.
    const requestedFormat = (getArgValue("--format") || "parquet").toLowerCase();
    const targetDir = getArgValue("--target-dir");
    const compression = getArgValue("--compression");
    const sampleRows = getArgValue("--sample-rows");
    const columns = getArgValue("--columns");
    const fastMode = args.includes("--fast");
    const preview = args.includes("--preview");
    // Validate the row count up front so a typo fails fast instead of handing NaN to the exporter.
    let sampleRowCount;
    if (sampleRows) {
        sampleRowCount = Number(sampleRows);
        if (!Number.isInteger(sampleRowCount) || sampleRowCount < 0) {
            console.error(`Export failed: --sample-rows expects a non-negative integer, got "${sampleRows}".`);
            process.exit(1);
        }
    }
    const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
    const resolvedTargetDir = path.resolve(targetDir || process.cwd());
    let sourcePath = resolveDatasetLocalPath(datasetId, [resolvedTargetDir, process.cwd()]);
    if (!sourcePath) {
        console.error(`Export failed: no local data found for ${datasetId}. Run download or prepare first, or pass a direct local path.`);
        process.exit(1);
    }
    sourcePath = ensureExportableLocalPath(sourcePath);
    // Registry bookkeeping is best-effort: a failure is logged and the export continues.
    try {
        if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
            upsertRegistry(datasetId, sourcePath, "completed");
        }
    }
    catch (e) {
        console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
    }
    if (!fastMode) {
        const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
        // The pipeline is only asked for csv or parquet; any other requested format uses parquet here.
        const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
        const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
        if (pipelineCompatibleInput && currentExt !== pipelineFmt) {
            try {
                sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, resolvedTargetDir);
                const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
                if (pipelineResult.final_output_path) {
                    sourcePath = pipelineResult.final_output_path;
                    if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
                        upsertRegistry(datasetId, sourcePath, "completed");
                    }
                }
            }
            catch (err) {
                // A pipeline failure is not fatal: fall back to exporting the current source file.
                console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
            }
        }
    }
    const exportOpts = {};
    if (compression) {
        exportOpts.compression = compression;
    }
    if (preview) {
        exportOpts.preview = true;
    }
    if (sampleRows) {
        exportOpts.sample_rows = sampleRowCount;
    }
    if (columns) {
        exportOpts.columns = columns.split(",").map(col => col.trim()).filter(Boolean);
    }
    // A Map avoids resolving inherited Object.prototype keys (e.g. "constructor") from user input.
    const extensionByFormat = new Map([
        ["feather", ".feather"],
        ["parquet", ".parquet"],
        ["csv", ".csv"],
        ["jsonl", ".jsonl"],
        ["arrow", ".arrow"],
        ["tfrecord", ".tfrecord"],
    ]);
    // Formats without a known extension keep the ".parquet" fallback.
    const ext = extensionByFormat.get(requestedFormat) ?? ".parquet";
    const safeName = getExportFileStem(datasetId);
    const outDir = resolvedTargetDir;
    if (!fs.existsSync(outDir)) {
        fs.mkdirSync(outDir, { recursive: true });
    }
    const outputFile = path.join(outDir, `${safeName}${ext}`);
    console.error(`[Export] Resolved output directory: ${outDir}`);
    console.error(`[Export] Output file: ${outputFile}`);
    const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
    console.log(`Export complete: ${result.output_path}`);
    console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
    if (result.rows !== undefined) {
        console.log(`Rows: ${result.rows.toLocaleString()}`);
    }
    if (result.columns !== undefined) {
        console.log(`Columns: ${result.columns}`);
    }
    if (result.file_size_mb !== undefined) {
        console.log(`Size: ${result.file_size_mb} MB`);
    }
    if (result.preview_path) {
        console.log(`Preview: ${result.preview_path}`);
    }
}
|
|
2292
3268
|
async function runFuseCli(args) {
|
|
2293
3269
|
const getArgValue = (name) => {
|
|
2294
3270
|
const idx = args.findIndex(a => a === name);
|