@vespermcp/mcp-server 1.2.20 → 1.2.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +6 -0
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +410 -0
- package/build/index.js +1592 -837
- package/build/ingestion/hf-downloader.js +12 -2
- package/build/ingestion/ingestor.js +19 -9
- package/build/install/install-service.js +11 -6
- package/build/lib/supabase.js +3 -0
- package/build/metadata/scraper.js +85 -14
- package/build/python/asset_downloader_engine.py +22 -1
- package/build/python/convert_engine.py +92 -0
- package/build/python/export_engine.py +45 -0
- package/build/python/hf_fallback.py +196 -45
- package/build/python/kaggle_engine.py +77 -5
- package/build/python/normalize_engine.py +83 -0
- package/build/python/vesper/core/asset_downloader.py +238 -48
- package/build/search/engine.js +43 -5
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +509 -0
- package/build/tools/formatter.js +6 -3
- package/build/utils/python-runtime.js +130 -0
- package/package.json +7 -5
- package/scripts/postinstall.cjs +87 -31
- package/scripts/wizard.cjs +601 -0
- package/scripts/wizard.js +306 -12
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +22 -1
- package/src/python/convert_engine.py +92 -0
- package/src/python/export_engine.py +45 -0
- package/src/python/hf_fallback.py +196 -45
- package/src/python/kaggle_engine.py +77 -5
- package/src/python/normalize_engine.py +83 -0
- package/src/python/requirements.txt +12 -0
- package/src/python/vesper/core/asset_downloader.py +238 -48
- package/wizard.cjs +3 -0
package/build/index.js
CHANGED
|
@@ -1,12 +1,39 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
// --- Dataset ID Normalization ---
|
|
3
3
|
function normalize_dataset_id(dataset_id) {
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
const trimmed = dataset_id.trim();
|
|
5
|
+
const sourceMatch = trimmed.match(/^(kaggle|huggingface|hf|openml|dataworld):/i);
|
|
6
|
+
let id = trimmed.replace(/^(kaggle|huggingface|hf|openml|dataworld):/i, "");
|
|
6
7
|
// Replace / and : with _ for filesystem safety
|
|
7
|
-
id = id.replace(/[
|
|
8
|
-
|
|
9
|
-
|
|
8
|
+
id = id.replace(/[\\/:]/g, "_");
|
|
9
|
+
if (!sourceMatch) {
|
|
10
|
+
return id;
|
|
11
|
+
}
|
|
12
|
+
const source = sourceMatch[1].toLowerCase() === "hf" ? "huggingface" : sourceMatch[1].toLowerCase();
|
|
13
|
+
return `${source}_${id}`;
|
|
14
|
+
}
|
|
15
|
+
function getDatasetIdAliases(dataset_id) {
|
|
16
|
+
const trimmed = dataset_id.trim();
|
|
17
|
+
const aliases = new Set([trimmed]);
|
|
18
|
+
const sourceMatch = trimmed.match(/^(kaggle|huggingface|hf|openml|dataworld):/i);
|
|
19
|
+
if (sourceMatch) {
|
|
20
|
+
const stripped = trimmed.replace(/^(kaggle|huggingface|hf|openml|dataworld):/i, "");
|
|
21
|
+
aliases.add(stripped);
|
|
22
|
+
if (sourceMatch[1].toLowerCase() === "hf") {
|
|
23
|
+
aliases.add(`huggingface:${stripped}`);
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
else {
|
|
27
|
+
aliases.add(`kaggle:${trimmed}`);
|
|
28
|
+
aliases.add(`huggingface:${trimmed}`);
|
|
29
|
+
aliases.add(`hf:${trimmed}`);
|
|
30
|
+
aliases.add(`openml:${trimmed}`);
|
|
31
|
+
aliases.add(`dataworld:${trimmed}`);
|
|
32
|
+
}
|
|
33
|
+
return Array.from(aliases);
|
|
34
|
+
}
|
|
35
|
+
function toSafeDatasetPathFragment(dataset_id) {
|
|
36
|
+
return normalize_dataset_id(dataset_id);
|
|
10
37
|
}
|
|
11
38
|
// --- Dataset Registry Helpers ---
|
|
12
39
|
function getRegistryPath() {
|
|
@@ -29,10 +56,11 @@ function writeRegistry(entries) {
|
|
|
29
56
|
fs.writeFileSync(registryPath, JSON.stringify(entries, null, 2));
|
|
30
57
|
}
|
|
31
58
|
function upsertRegistry(dataset_id, local_path, status) {
|
|
32
|
-
const
|
|
59
|
+
const aliases = getDatasetIdAliases(dataset_id).map(normalize_dataset_id);
|
|
60
|
+
const norm_id = aliases[0];
|
|
33
61
|
console.error(`[Registry] Writing key: ${norm_id}`);
|
|
34
62
|
const entries = readRegistry();
|
|
35
|
-
const idx = entries.findIndex(e => e.dataset_id
|
|
63
|
+
const idx = entries.findIndex(e => aliases.includes(e.dataset_id || e.id));
|
|
36
64
|
if (idx >= 0) {
|
|
37
65
|
entries[idx] = { dataset_id: norm_id, local_path, status };
|
|
38
66
|
}
|
|
@@ -42,9 +70,163 @@ function upsertRegistry(dataset_id, local_path, status) {
|
|
|
42
70
|
writeRegistry(entries);
|
|
43
71
|
}
|
|
44
72
|
function getRegistryEntry(dataset_id) {
|
|
45
|
-
const
|
|
46
|
-
console.error(`[Registry] Lookup
|
|
47
|
-
return readRegistry().find(e => (e.dataset_id || e.id)
|
|
73
|
+
const aliases = getDatasetIdAliases(dataset_id).map(normalize_dataset_id);
|
|
74
|
+
console.error(`[Registry] Lookup keys: ${aliases.join(", ")}`);
|
|
75
|
+
return readRegistry().find(e => aliases.includes((e.dataset_id || e.id)));
|
|
76
|
+
}
|
|
77
|
+
const STRUCTURED_FILE_EXTENSIONS = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow", ".tsv", ".txt"];
|
|
78
|
+
const IMAGE_FILE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"]);
|
|
79
|
+
function walkFilesRecursive(rootDir) {
|
|
80
|
+
const out = [];
|
|
81
|
+
const stack = [rootDir];
|
|
82
|
+
while (stack.length > 0) {
|
|
83
|
+
const currentDir = stack.pop();
|
|
84
|
+
const entries = fs.readdirSync(currentDir, { withFileTypes: true });
|
|
85
|
+
for (const entry of entries) {
|
|
86
|
+
const fullPath = path.join(currentDir, entry.name);
|
|
87
|
+
if (entry.isDirectory()) {
|
|
88
|
+
stack.push(fullPath);
|
|
89
|
+
}
|
|
90
|
+
else if (entry.isFile()) {
|
|
91
|
+
out.push(fullPath);
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
out.sort();
|
|
96
|
+
return out;
|
|
97
|
+
}
|
|
98
|
+
function inferImageManifestRecord(rootDir, fullPath, index) {
|
|
99
|
+
const relativePath = path.relative(rootDir, fullPath).replace(/\\/g, "/");
|
|
100
|
+
const parentDir = path.posix.dirname(relativePath);
|
|
101
|
+
const parts = parentDir.split("/").filter(part => part && part !== ".");
|
|
102
|
+
let split;
|
|
103
|
+
let label;
|
|
104
|
+
if (parts.length > 0) {
|
|
105
|
+
const first = parts[0].toLowerCase();
|
|
106
|
+
if (["train", "test", "val", "valid", "validation"].includes(first)) {
|
|
107
|
+
split = parts[0];
|
|
108
|
+
if (parts.length > 1) {
|
|
109
|
+
label = parts[parts.length - 1];
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
else {
|
|
113
|
+
label = parts[parts.length - 1];
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
return {
|
|
117
|
+
id: index,
|
|
118
|
+
image_path: path.resolve(fullPath),
|
|
119
|
+
relative_path: relativePath,
|
|
120
|
+
file_name: path.basename(fullPath),
|
|
121
|
+
extension: path.extname(fullPath).toLowerCase().replace(/^\./, ""),
|
|
122
|
+
...(split ? { split } : {}),
|
|
123
|
+
...(label ? { label } : {}),
|
|
124
|
+
};
|
|
125
|
+
}
|
|
126
|
+
function createImageManifestFromDirectory(rootDir) {
|
|
127
|
+
const imageFiles = walkFilesRecursive(rootDir).filter(filePath => IMAGE_FILE_EXTENSIONS.has(path.extname(filePath).toLowerCase()));
|
|
128
|
+
if (imageFiles.length === 0) {
|
|
129
|
+
throw new Error(`No image files found under ${rootDir}`);
|
|
130
|
+
}
|
|
131
|
+
const manifestPath = path.join(rootDir, "_vesper_image_manifest.jsonl");
|
|
132
|
+
const lines = imageFiles.map((filePath, index) => JSON.stringify(inferImageManifestRecord(rootDir, filePath, index)));
|
|
133
|
+
fs.writeFileSync(manifestPath, `${lines.join("\n")}\n`, "utf-8");
|
|
134
|
+
return manifestPath;
|
|
135
|
+
}
|
|
136
|
+
function ensureExportableLocalPath(localPath) {
|
|
137
|
+
if (!fs.existsSync(localPath)) {
|
|
138
|
+
throw new Error(`Local path not found: ${localPath}`);
|
|
139
|
+
}
|
|
140
|
+
const stats = fs.statSync(localPath);
|
|
141
|
+
if (stats.isFile()) {
|
|
142
|
+
return localPath;
|
|
143
|
+
}
|
|
144
|
+
const manifestPath = path.join(localPath, "_vesper_image_manifest.jsonl");
|
|
145
|
+
if (fs.existsSync(manifestPath)) {
|
|
146
|
+
return manifestPath;
|
|
147
|
+
}
|
|
148
|
+
const candidates = walkFilesRecursive(localPath);
|
|
149
|
+
for (const ext of STRUCTURED_FILE_EXTENSIONS) {
|
|
150
|
+
const match = candidates.find(candidate => path.extname(candidate).toLowerCase() === ext);
|
|
151
|
+
if (match) {
|
|
152
|
+
return match;
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
return createImageManifestFromDirectory(localPath);
|
|
156
|
+
}
|
|
157
|
+
function isPathWithinDirectory(candidatePath, directoryPath) {
|
|
158
|
+
const relativePath = path.relative(path.resolve(directoryPath), path.resolve(candidatePath));
|
|
159
|
+
return relativePath === "" || (!relativePath.startsWith("..") && !path.isAbsolute(relativePath));
|
|
160
|
+
}
|
|
161
|
+
function buildDatasetCandidatePaths(baseDir, safeId) {
|
|
162
|
+
return [
|
|
163
|
+
path.join(baseDir, `${safeId}.parquet`),
|
|
164
|
+
path.join(baseDir, `${safeId}.csv`),
|
|
165
|
+
path.join(baseDir, `${safeId}.jsonl`),
|
|
166
|
+
path.join(baseDir, `${safeId}.json`),
|
|
167
|
+
path.join(baseDir, `${safeId}.feather`),
|
|
168
|
+
path.join(baseDir, `${safeId}.arrow`),
|
|
169
|
+
path.join(baseDir, safeId),
|
|
170
|
+
];
|
|
171
|
+
}
|
|
172
|
+
function shouldTrackExportPath(localPath) {
|
|
173
|
+
return isPathWithinDirectory(localPath, dataRoot);
|
|
174
|
+
}
|
|
175
|
+
function isDirectLocalDatasetReference(datasetIdOrPath) {
|
|
176
|
+
return fs.existsSync(datasetIdOrPath);
|
|
177
|
+
}
|
|
178
|
+
function getExportFileStem(datasetIdOrPath) {
|
|
179
|
+
if (isDirectLocalDatasetReference(datasetIdOrPath)) {
|
|
180
|
+
const resolvedPath = path.resolve(datasetIdOrPath);
|
|
181
|
+
const stats = fs.statSync(resolvedPath);
|
|
182
|
+
const baseName = stats.isDirectory()
|
|
183
|
+
? path.basename(resolvedPath)
|
|
184
|
+
: path.parse(resolvedPath).name;
|
|
185
|
+
return toSafeDatasetPathFragment(baseName);
|
|
186
|
+
}
|
|
187
|
+
return toSafeDatasetPathFragment(datasetIdOrPath);
|
|
188
|
+
}
|
|
189
|
+
function ensureLocalPipelineSource(sourcePath, datasetId, targetDir) {
|
|
190
|
+
const resolvedTargetDir = path.resolve(targetDir);
|
|
191
|
+
const resolvedSourcePath = path.resolve(sourcePath);
|
|
192
|
+
if (path.dirname(resolvedSourcePath) === resolvedTargetDir) {
|
|
193
|
+
return resolvedSourcePath;
|
|
194
|
+
}
|
|
195
|
+
if (!fs.existsSync(resolvedTargetDir)) {
|
|
196
|
+
fs.mkdirSync(resolvedTargetDir, { recursive: true });
|
|
197
|
+
}
|
|
198
|
+
const stagedPath = path.join(resolvedTargetDir, `${toSafeDatasetPathFragment(datasetId)}${path.extname(resolvedSourcePath)}`);
|
|
199
|
+
if (resolvedSourcePath !== stagedPath) {
|
|
200
|
+
fs.copyFileSync(resolvedSourcePath, stagedPath);
|
|
201
|
+
}
|
|
202
|
+
return stagedPath;
|
|
203
|
+
}
|
|
204
|
+
function resolveDatasetLocalPath(datasetIdOrPath, preferredDirs = []) {
|
|
205
|
+
if (fs.existsSync(datasetIdOrPath)) {
|
|
206
|
+
return ensureExportableLocalPath(datasetIdOrPath);
|
|
207
|
+
}
|
|
208
|
+
const safeId = toSafeDatasetPathFragment(datasetIdOrPath);
|
|
209
|
+
const uniquePreferredDirs = Array.from(new Set(preferredDirs
|
|
210
|
+
.filter((dir) => typeof dir === "string" && dir.trim().length > 0)
|
|
211
|
+
.map(dir => path.resolve(dir))));
|
|
212
|
+
for (const preferredDir of uniquePreferredDirs) {
|
|
213
|
+
const localMatch = buildDatasetCandidatePaths(preferredDir, safeId).find(candidate => fs.existsSync(candidate));
|
|
214
|
+
if (localMatch) {
|
|
215
|
+
return ensureExportableLocalPath(localMatch);
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
const downloadStatus = metadataStore.getDownloadStatus(datasetIdOrPath);
|
|
219
|
+
if (downloadStatus?.local_path && fs.existsSync(downloadStatus.local_path)) {
|
|
220
|
+
return ensureExportableLocalPath(downloadStatus.local_path);
|
|
221
|
+
}
|
|
222
|
+
const reg = getRegistryEntry(datasetIdOrPath);
|
|
223
|
+
const regPath = reg?.local_path || reg?.path;
|
|
224
|
+
if (regPath && fs.existsSync(regPath)) {
|
|
225
|
+
return ensureExportableLocalPath(regPath);
|
|
226
|
+
}
|
|
227
|
+
const rawCandidates = buildDatasetCandidatePaths(path.join(dataRoot, "data", "raw"), safeId);
|
|
228
|
+
const match = rawCandidates.find(candidate => fs.existsSync(candidate));
|
|
229
|
+
return match ? ensureExportableLocalPath(match) : undefined;
|
|
48
230
|
}
|
|
49
231
|
// --- Pipeline State Tracker ---
|
|
50
232
|
// Tracks completed steps per session/job/dataset
|
|
@@ -88,6 +270,7 @@ import { HuggingFaceScraper } from "./metadata/scraper.js";
|
|
|
88
270
|
import { KaggleSource } from "./metadata/kaggle-source.js";
|
|
89
271
|
import { OpenMLSource } from "./metadata/openml-source.js";
|
|
90
272
|
import { DataWorldSource } from "./metadata/dataworld-source.js";
|
|
273
|
+
import { UnifiedDatasetGateway } from "./gateway/unified-dataset-gateway.js";
|
|
91
274
|
import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
|
|
92
275
|
import { JobManager } from "./jobs/manager.js";
|
|
93
276
|
import { QualityAnalyzer } from "./quality/analyzer.js";
|
|
@@ -131,6 +314,34 @@ function logError(err, context) {
|
|
|
131
314
|
fs.appendFileSync(errorLogPath, msg);
|
|
132
315
|
console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
|
|
133
316
|
}
|
|
317
|
+
// --- Request Queue: serialize all MCP tool calls to prevent crashes ---
|
|
318
|
+
class RequestQueue {
|
|
319
|
+
queue = [];
|
|
320
|
+
running = false;
|
|
321
|
+
enqueue(task) {
|
|
322
|
+
return new Promise((resolve, reject) => {
|
|
323
|
+
this.queue.push({ resolve, reject, task });
|
|
324
|
+
this.drain();
|
|
325
|
+
});
|
|
326
|
+
}
|
|
327
|
+
async drain() {
|
|
328
|
+
if (this.running)
|
|
329
|
+
return;
|
|
330
|
+
this.running = true;
|
|
331
|
+
while (this.queue.length > 0) {
|
|
332
|
+
const item = this.queue.shift();
|
|
333
|
+
try {
|
|
334
|
+
const result = await item.task();
|
|
335
|
+
item.resolve(result);
|
|
336
|
+
}
|
|
337
|
+
catch (err) {
|
|
338
|
+
item.reject(err);
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
this.running = false;
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
const requestQueue = new RequestQueue();
|
|
134
345
|
const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
|
|
135
346
|
function printLaunchScreen() {
|
|
136
347
|
const screen = `
|
|
@@ -198,6 +409,21 @@ function extractRequestedRows(query, requirements) {
|
|
|
198
409
|
if (Number.isFinite(n) && n > 0)
|
|
199
410
|
return n;
|
|
200
411
|
}
|
|
412
|
+
const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
|
|
413
|
+
.map(m => Number(m[0].replace(/,/g, "")))
|
|
414
|
+
.filter(n => Number.isFinite(n) && n > 0);
|
|
415
|
+
if (commaNumbers.length > 0)
|
|
416
|
+
return Math.max(...commaNumbers);
|
|
417
|
+
const humanSized = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
|
|
418
|
+
.map(m => {
|
|
419
|
+
const base = Number(m[1]);
|
|
420
|
+
const suffix = m[2].toLowerCase();
|
|
421
|
+
const multiplier = suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000;
|
|
422
|
+
return Math.round(base * multiplier);
|
|
423
|
+
})
|
|
424
|
+
.filter(n => Number.isFinite(n) && n > 0);
|
|
425
|
+
if (humanSized.length > 0)
|
|
426
|
+
return Math.max(...humanSized);
|
|
201
427
|
const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
|
|
202
428
|
.map(m => Number(m[0]))
|
|
203
429
|
.filter(n => Number.isFinite(n) && n > 0);
|
|
@@ -367,7 +593,45 @@ function syncPythonScripts(appRoot, dataRoot) {
|
|
|
367
593
|
}
|
|
368
594
|
// Sync scripts immediately
|
|
369
595
|
syncPythonScripts(appRoot, dataRoot);
|
|
370
|
-
|
|
596
|
+
// Auto-rebuild better-sqlite3 if native binary doesn't match current Node version
|
|
597
|
+
function tryRebuildSqlite() {
|
|
598
|
+
try {
|
|
599
|
+
const { execSync } = require("child_process");
|
|
600
|
+
const pkgRoot = path.resolve(__dirname, "..");
|
|
601
|
+
console.error("[Vesper] Rebuilding better-sqlite3 for Node " + process.version + "...");
|
|
602
|
+
execSync("npm rebuild better-sqlite3", {
|
|
603
|
+
stdio: "pipe",
|
|
604
|
+
timeout: 60000,
|
|
605
|
+
cwd: pkgRoot,
|
|
606
|
+
});
|
|
607
|
+
console.error("[Vesper] Rebuild succeeded. Retrying...");
|
|
608
|
+
// Clear require cache so the rebuilt module is loaded
|
|
609
|
+
for (const key of Object.keys(require.cache)) {
|
|
610
|
+
if (key.includes("better-sqlite3") || key.includes("better_sqlite3")) {
|
|
611
|
+
delete require.cache[key];
|
|
612
|
+
}
|
|
613
|
+
}
|
|
614
|
+
return true;
|
|
615
|
+
}
|
|
616
|
+
catch (e) {
|
|
617
|
+
console.error("[Vesper] Auto-rebuild failed: " + (e?.message || e));
|
|
618
|
+
return false;
|
|
619
|
+
}
|
|
620
|
+
}
|
|
621
|
+
let metadataStore;
|
|
622
|
+
try {
|
|
623
|
+
metadataStore = new MetadataStore(dbPath);
|
|
624
|
+
}
|
|
625
|
+
catch (e) {
|
|
626
|
+
if (e?.code === "ERR_DLOPEN_FAILED" && tryRebuildSqlite()) {
|
|
627
|
+
metadataStore = new MetadataStore(dbPath);
|
|
628
|
+
}
|
|
629
|
+
else {
|
|
630
|
+
console.error("[Vesper] FATAL: Cannot load better-sqlite3.");
|
|
631
|
+
console.error("[Vesper] Run: npm rebuild better-sqlite3");
|
|
632
|
+
throw e;
|
|
633
|
+
}
|
|
634
|
+
}
|
|
371
635
|
const vectorStore = new VectorStore(vectorPath);
|
|
372
636
|
const embedder = Embedder.getInstance();
|
|
373
637
|
const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
|
|
@@ -382,6 +646,8 @@ const dataSplitter = new DataSplitter(__dirname);
|
|
|
382
646
|
const dataExporter = new DataExporter(__dirname);
|
|
383
647
|
const fusionEngine = new DataFusionEngine(__dirname);
|
|
384
648
|
const kaggleSource = new KaggleSource(__dirname);
|
|
649
|
+
const openmlSource = new OpenMLSource(__dirname);
|
|
650
|
+
const dataworldSource = new DataWorldSource(__dirname);
|
|
385
651
|
const secureKeys = new SecureKeysManager(__dirname);
|
|
386
652
|
function hydrateExternalKeys() {
|
|
387
653
|
const keys = secureKeys.getAll();
|
|
@@ -401,6 +667,15 @@ function hydrateExternalKeys() {
|
|
|
401
667
|
function hasDataWorldToken() {
|
|
402
668
|
return !!(process.env.DW_AUTH_TOKEN || secureKeys.getAll().dataworld_token);
|
|
403
669
|
}
|
|
670
|
+
const unifiedDatasetGateway = new UnifiedDatasetGateway({
|
|
671
|
+
metadataStore,
|
|
672
|
+
dataIngestor,
|
|
673
|
+
dataRoot,
|
|
674
|
+
kaggleSource,
|
|
675
|
+
openmlSource,
|
|
676
|
+
dataworldSource,
|
|
677
|
+
hasDataWorldToken,
|
|
678
|
+
});
|
|
404
679
|
// CRITICAL FIX: Pass __dirname (build directory) to analyzers
|
|
405
680
|
// Python scripts are in build/python/, so analyzers should look relative to build/
|
|
406
681
|
// NOT relative to project root (appRoot)
|
|
@@ -432,7 +707,7 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
432
707
|
console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
|
|
433
708
|
const metadata = job.metadata ? JSON.parse(job.metadata) : {};
|
|
434
709
|
switch (job.type) {
|
|
435
|
-
case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements);
|
|
710
|
+
case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements, metadata.outputDir);
|
|
436
711
|
case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
|
|
437
712
|
default: throw new Error(`Unhandled job type: ${job.type}`);
|
|
438
713
|
}
|
|
@@ -450,9 +725,21 @@ jobManager.on("processJob", async (job, execute) => {
|
|
|
450
725
|
/**
|
|
451
726
|
* Logic for preparing a dataset (Search + Ingest + Process)
|
|
452
727
|
*/
|
|
453
|
-
async function handlePrepareJob(jobId, query, requirements) {
|
|
728
|
+
async function handlePrepareJob(jobId, query, requirements, outputDir) {
|
|
454
729
|
hydrateExternalKeys();
|
|
455
730
|
const update = (updates) => jobManager.updateJob(jobId, updates);
|
|
731
|
+
const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
|
|
732
|
+
const stepStatus = {};
|
|
733
|
+
for (const s of pipelineSteps)
|
|
734
|
+
stepStatus[s] = "pending";
|
|
735
|
+
const markPipelineStep = (step, status) => {
|
|
736
|
+
stepStatus[step] = status;
|
|
737
|
+
const summary = pipelineSteps.map(s => {
|
|
738
|
+
const st = stepStatus[s];
|
|
739
|
+
return st === "done" ? `[${s}]` : st === "running" ? `>${s}<` : st === "failed" ? `!${s}!` : st === "skipped" ? `~${s}~` : ` ${s} `;
|
|
740
|
+
}).join(" → ");
|
|
741
|
+
console.error(`[Pipeline] ${summary}`);
|
|
742
|
+
};
|
|
456
743
|
// Ensure core Python packages are available for dataset operations
|
|
457
744
|
try {
|
|
458
745
|
await ensurePythonModules([
|
|
@@ -465,6 +752,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
465
752
|
// Continue anyway - direct file downloads may still work without datasets lib
|
|
466
753
|
}
|
|
467
754
|
const requestedRows = extractRequestedRows(query, requirements);
|
|
755
|
+
const searchQuery = requirements ? `${query} ${requirements}` : query;
|
|
468
756
|
let selectedDataset;
|
|
469
757
|
let datasetIdForDownload = "";
|
|
470
758
|
let source;
|
|
@@ -500,11 +788,14 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
500
788
|
progress: 20,
|
|
501
789
|
status_text: `Using explicit dataset id: ${datasetIdForDownload} (${source})`
|
|
502
790
|
});
|
|
791
|
+
markPipelineStep("search", "skipped");
|
|
503
792
|
}
|
|
504
793
|
else {
|
|
794
|
+
markPipelineStep("search", "running");
|
|
505
795
|
update({ progress: 10, status_text: "Searching for best dataset matching query..." });
|
|
506
|
-
const results = await searchEngine.search(
|
|
796
|
+
const results = await searchEngine.search(searchQuery, { limit: 10 });
|
|
507
797
|
if (results.length === 0) {
|
|
798
|
+
markPipelineStep("search", "failed");
|
|
508
799
|
throw new Error("No datasets found matching the query. Try refining your search terms.");
|
|
509
800
|
}
|
|
510
801
|
// Pick the best result that we can actually download (skip sources requiring missing credentials)
|
|
@@ -524,8 +815,10 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
524
815
|
progress: 20,
|
|
525
816
|
status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
|
|
526
817
|
});
|
|
818
|
+
markPipelineStep("search", "done");
|
|
527
819
|
}
|
|
528
820
|
// Pre-check credentials for sources that require them
|
|
821
|
+
markPipelineStep("validate", "running");
|
|
529
822
|
if (source === "kaggle") {
|
|
530
823
|
const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
|
|
531
824
|
if (!hasKaggleCreds) {
|
|
@@ -533,8 +826,11 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
533
826
|
}
|
|
534
827
|
}
|
|
535
828
|
if (source === "dataworld" && !hasDataWorldToken()) {
|
|
829
|
+
markPipelineStep("validate", "failed");
|
|
536
830
|
throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
|
|
537
831
|
}
|
|
832
|
+
markPipelineStep("validate", "done");
|
|
833
|
+
markPipelineStep("download", "running");
|
|
538
834
|
update({ progress: 30, status_text: `Starting download from ${source}...` });
|
|
539
835
|
// ensureData handles download and returns path to the raw file
|
|
540
836
|
let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
|
|
@@ -545,7 +841,7 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
545
841
|
let currentRows = await countRows(rawFilePath);
|
|
546
842
|
if (currentRows < requestedRows) {
|
|
547
843
|
update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
|
|
548
|
-
const additional = await searchEngine.search(
|
|
844
|
+
const additional = await searchEngine.search(searchQuery, { limit: 8 });
|
|
549
845
|
const sourceFiles = [rawFilePath];
|
|
550
846
|
let totalRows = currentRows;
|
|
551
847
|
for (const ds of additional) {
|
|
@@ -597,15 +893,50 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
597
893
|
update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
|
|
598
894
|
}
|
|
599
895
|
}
|
|
896
|
+
markPipelineStep("download", "done");
|
|
897
|
+
// ── Normalize step: convert any raw format → parquet ──
|
|
898
|
+
markPipelineStep("normalize", "running");
|
|
899
|
+
const rawExt = path.extname(rawFilePath).toLowerCase();
|
|
900
|
+
if (rawExt !== ".parquet" && rawExt !== ".pq") {
|
|
901
|
+
update({ progress: 70, status_text: "Normalizing to parquet..." });
|
|
902
|
+
const normalizedDir = path.join(dataRoot, "data", "normalized");
|
|
903
|
+
if (!fs.existsSync(normalizedDir))
|
|
904
|
+
fs.mkdirSync(normalizedDir, { recursive: true });
|
|
905
|
+
const safeId = toSafeDatasetPathFragment(datasetIdForDownload);
|
|
906
|
+
const normalizedPath = path.join(normalizedDir, `${safeId}.parquet`);
|
|
907
|
+
try {
|
|
908
|
+
const normScript = path.join(dataRoot, "python", "normalize_engine.py");
|
|
909
|
+
const normResult = await runPythonJson(normScript, [rawFilePath, normalizedPath]);
|
|
910
|
+
if (normResult.ok && fs.existsSync(normalizedPath)) {
|
|
911
|
+
console.error(`[Prepare] Normalized ${rawExt} → parquet (${normResult.rows} rows)`);
|
|
912
|
+
rawFilePath = normalizedPath;
|
|
913
|
+
markPipelineStep("normalize", "done");
|
|
914
|
+
}
|
|
915
|
+
else {
|
|
916
|
+
console.error(`[Prepare] Normalize failed: ${normResult.error}, continuing with raw file`);
|
|
917
|
+
markPipelineStep("normalize", "skipped");
|
|
918
|
+
}
|
|
919
|
+
}
|
|
920
|
+
catch (e) {
|
|
921
|
+
console.error(`[Prepare] Normalize step failed: ${e?.message || e}, continuing with raw file`);
|
|
922
|
+
markPipelineStep("normalize", "skipped");
|
|
923
|
+
}
|
|
924
|
+
}
|
|
925
|
+
else {
|
|
926
|
+
markPipelineStep("normalize", "done");
|
|
927
|
+
}
|
|
600
928
|
let qualityScore = selectedDataset?.quality_score ?? 70;
|
|
601
|
-
|
|
929
|
+
markPipelineStep("quality", "running");
|
|
930
|
+
update({ progress: 75, status_text: "Analyzing dataset quality..." });
|
|
602
931
|
try {
|
|
603
932
|
const report = await qualityAnalyzer.analyze(rawFilePath);
|
|
604
933
|
qualityScore = report.overall_score;
|
|
934
|
+
markPipelineStep("quality", "done");
|
|
605
935
|
}
|
|
606
936
|
catch (error) {
|
|
607
937
|
console.error(`[Prepare] Quality analysis failed for ${datasetIdForDownload}: ${error?.message || error}`);
|
|
608
938
|
update({ progress: 78, status_text: "Quality analysis skipped (unsupported schema). Continuing installation..." });
|
|
939
|
+
markPipelineStep("quality", "skipped");
|
|
609
940
|
}
|
|
610
941
|
if (selectedDataset) {
|
|
611
942
|
metadataStore.saveDataset({
|
|
@@ -613,15 +944,62 @@ async function handlePrepareJob(jobId, query, requirements) {
|
|
|
613
944
|
quality_score: qualityScore
|
|
614
945
|
});
|
|
615
946
|
}
|
|
947
|
+
else {
|
|
948
|
+
// Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
|
|
949
|
+
try {
|
|
950
|
+
const existingMeta = metadataStore.getDataset(datasetIdForDownload);
|
|
951
|
+
if (!existingMeta) {
|
|
952
|
+
metadataStore.saveDataset({
|
|
953
|
+
id: datasetIdForDownload,
|
|
954
|
+
source: source,
|
|
955
|
+
name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
|
|
956
|
+
description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
|
|
957
|
+
quality_warnings: [],
|
|
958
|
+
downloads: 0,
|
|
959
|
+
likes: 0,
|
|
960
|
+
stars: 0,
|
|
961
|
+
tags: [],
|
|
962
|
+
last_updated: new Date().toISOString(),
|
|
963
|
+
task: "unknown",
|
|
964
|
+
domain: "unknown",
|
|
965
|
+
languages: [],
|
|
966
|
+
splits: [],
|
|
967
|
+
license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
|
|
968
|
+
quality_score: qualityScore,
|
|
969
|
+
download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
|
|
970
|
+
total_examples: 0,
|
|
971
|
+
is_structured: false,
|
|
972
|
+
has_target_column: false,
|
|
973
|
+
is_safe_source: true,
|
|
974
|
+
has_personal_data: false,
|
|
975
|
+
is_paywalled: false,
|
|
976
|
+
is_scraped_web_data: false,
|
|
977
|
+
uses_https: true,
|
|
978
|
+
has_train_split: false,
|
|
979
|
+
has_test_split: false,
|
|
980
|
+
has_validation_split: false,
|
|
981
|
+
description_length: 0,
|
|
982
|
+
has_readme: false,
|
|
983
|
+
});
|
|
984
|
+
}
|
|
985
|
+
}
|
|
986
|
+
catch (e) {
|
|
987
|
+
console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
|
|
988
|
+
}
|
|
989
|
+
}
|
|
990
|
+
markPipelineStep("register", "running");
|
|
616
991
|
update({ progress: 85, status_text: "Installing dataset into project..." });
|
|
617
|
-
const installPath = await installService.install(datasetIdForDownload, rawFilePath);
|
|
992
|
+
const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
|
|
618
993
|
update({ progress: 100, status_text: "Preparation complete!" });
|
|
619
994
|
// Register prepared dataset in local registry for lookup by export/list tools
|
|
620
995
|
try {
|
|
621
996
|
upsertRegistry(datasetIdForDownload, installPath, "completed");
|
|
997
|
+
markPipelineStep("register", "done");
|
|
998
|
+
markStepComplete(datasetIdForDownload, "prepare");
|
|
622
999
|
}
|
|
623
1000
|
catch (e) {
|
|
624
1001
|
console.error(`[Registry] Failed to write registry for ${datasetIdForDownload}: ${e?.message || e}`);
|
|
1002
|
+
markPipelineStep("register", "failed");
|
|
625
1003
|
}
|
|
626
1004
|
return installPath;
|
|
627
1005
|
}
|
|
@@ -647,7 +1025,7 @@ async function handleCleanJob(jobId, datasetId, ops) {
|
|
|
647
1025
|
}
|
|
648
1026
|
// 3. Check standard raw data paths
|
|
649
1027
|
if (!filePath) {
|
|
650
|
-
const safeId = datasetId
|
|
1028
|
+
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
651
1029
|
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
652
1030
|
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
653
1031
|
const featherPath = path.join(dataRoot, "data", "raw", `${safeId}.feather`);
|
|
@@ -712,9 +1090,57 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
712
1090
|
required: ["query"],
|
|
713
1091
|
},
|
|
714
1092
|
},
|
|
1093
|
+
{
|
|
1094
|
+
name: "unified_dataset_api",
|
|
1095
|
+
description: "Single facade over multiple external dataset providers. Supports provider discovery, dataset search, dataset download, and dataset info through one MCP tool using public access and server-managed credentials when available.",
|
|
1096
|
+
inputSchema: {
|
|
1097
|
+
type: "object",
|
|
1098
|
+
properties: {
|
|
1099
|
+
operation: {
|
|
1100
|
+
type: "string",
|
|
1101
|
+
enum: ["providers", "discover", "download", "info"],
|
|
1102
|
+
description: "Gateway operation to execute.",
|
|
1103
|
+
},
|
|
1104
|
+
source: {
|
|
1105
|
+
type: "string",
|
|
1106
|
+
enum: ["auto", "huggingface", "kaggle", "openml", "dataworld", "s3", "bigquery"],
|
|
1107
|
+
description: "Optional provider selector. Use 'auto' to let Vesper choose a compatible backend.",
|
|
1108
|
+
},
|
|
1109
|
+
query: {
|
|
1110
|
+
type: "string",
|
|
1111
|
+
description: "Dataset discovery query. Required for operation='discover'.",
|
|
1112
|
+
},
|
|
1113
|
+
dataset_id: {
|
|
1114
|
+
type: "string",
|
|
1115
|
+
description: "Dataset identifier or object reference. Required for operation='download' and operation='info'. Supports prefixed ids like 'huggingface:user/dataset' and public S3 URIs like 's3://bucket/key'.",
|
|
1116
|
+
},
|
|
1117
|
+
limit: {
|
|
1118
|
+
type: "number",
|
|
1119
|
+
description: "Max results for operation='discover' (default: 10).",
|
|
1120
|
+
},
|
|
1121
|
+
target_dir: {
|
|
1122
|
+
type: "string",
|
|
1123
|
+
description: "Optional output directory for operation='download'. Defaults to the current working directory when omitted.",
|
|
1124
|
+
},
|
|
1125
|
+
output_dir: {
|
|
1126
|
+
type: "string",
|
|
1127
|
+
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
1128
|
+
},
|
|
1129
|
+
public_only: {
|
|
1130
|
+
type: "boolean",
|
|
1131
|
+
description: "When true, discover/info stay on public providers only unless a specific source is requested.",
|
|
1132
|
+
},
|
|
1133
|
+
include_unavailable: {
|
|
1134
|
+
type: "boolean",
|
|
1135
|
+
description: "When true, operation='providers' includes connectors that are scaffolded but not currently configured.",
|
|
1136
|
+
},
|
|
1137
|
+
},
|
|
1138
|
+
required: ["operation"],
|
|
1139
|
+
},
|
|
1140
|
+
},
|
|
715
1141
|
{
|
|
716
1142
|
name: "discover_datasets",
|
|
717
|
-
description: "Discover datasets from a specific source.
|
|
1143
|
+
description: "Discover datasets from a specific source. Public providers work keylessly; Kaggle and data.world can also be exposed through server-managed credentials.",
|
|
718
1144
|
inputSchema: {
|
|
719
1145
|
type: "object",
|
|
720
1146
|
properties: {
|
|
@@ -737,7 +1163,7 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
737
1163
|
},
|
|
738
1164
|
{
|
|
739
1165
|
name: "download_dataset",
|
|
740
|
-
description: "Download a dataset by source and ID/slug into local
|
|
1166
|
+
description: "Download a dataset by source and ID/slug into a local directory. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
|
|
741
1167
|
inputSchema: {
|
|
742
1168
|
type: "object",
|
|
743
1169
|
properties: {
|
|
@@ -752,7 +1178,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
752
1178
|
},
|
|
753
1179
|
target_dir: {
|
|
754
1180
|
type: "string",
|
|
755
|
-
description: "Optional target directory for downloaded files.",
|
|
1181
|
+
description: "Optional target directory for downloaded files. Defaults to the current working directory when omitted.",
|
|
1182
|
+
},
|
|
1183
|
+
output_dir: {
|
|
1184
|
+
type: "string",
|
|
1185
|
+
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
756
1186
|
}
|
|
757
1187
|
},
|
|
758
1188
|
required: ["dataset_id"],
|
|
@@ -760,19 +1190,21 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
760
1190
|
},
|
|
761
1191
|
{
|
|
762
1192
|
name: "vesper_download_assets",
|
|
763
|
-
description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL).",
|
|
1193
|
+
description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL). Auto-detects image columns from HF feature types, column names, and value patterns. Supports PIL Images, URL-based images, and binary image data.",
|
|
764
1194
|
inputSchema: {
|
|
765
1195
|
type: "object",
|
|
766
1196
|
properties: {
|
|
767
|
-
dataset_id: { type: "string", description: "Unique dataset identifier." },
|
|
1197
|
+
dataset_id: { type: "string", description: "Unique dataset identifier (e.g. 'user/dataset')." },
|
|
768
1198
|
source: { type: "string", enum: ["huggingface", "kaggle", "url"], description: "Asset source type." },
|
|
769
|
-
repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g.
|
|
1199
|
+
repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g. 'user/dataset'). Auto-inferred from dataset_id if omitted." },
|
|
770
1200
|
kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
|
|
771
1201
|
urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
|
|
772
1202
|
output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
|
|
1203
|
+
target_dir: { type: "string", description: "Optional local directory where downloaded assets should be written. If provided, Vesper writes directly to this directory instead of managed asset storage." },
|
|
1204
|
+
output_dir: { type: "string", description: "Alias for target_dir. When provided, downloaded assets are written directly to this local directory." },
|
|
773
1205
|
max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
|
|
774
1206
|
workers: { type: "number", description: "Parallel worker count (default 8)." },
|
|
775
|
-
image_column: { type: "string", description: "
|
|
1207
|
+
image_column: { type: "string", description: "Explicit image column name. If omitted, auto-detected from HF features, column names, and sample values." },
|
|
776
1208
|
},
|
|
777
1209
|
required: ["dataset_id", "source"],
|
|
778
1210
|
},
|
|
@@ -877,6 +1309,8 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
877
1309
|
properties: {
|
|
878
1310
|
query: { type: "string" },
|
|
879
1311
|
requirements: { type: "string" },
|
|
1312
|
+
target_dir: { type: "string", description: "Optional local directory for the prepared dataset. Defaults to the current working directory when omitted." },
|
|
1313
|
+
output_dir: { type: "string", description: "Alias for target_dir. Defaults to the current working directory when omitted." },
|
|
880
1314
|
download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
|
|
881
1315
|
cleaning_options: { type: "object" },
|
|
882
1316
|
split_config: { type: "object" },
|
|
@@ -921,7 +1355,11 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
921
1355
|
},
|
|
922
1356
|
target_dir: {
|
|
923
1357
|
type: "string",
|
|
924
|
-
description: "Optional custom local directory for export
|
|
1358
|
+
description: "Optional custom local directory for export. Defaults to the current working directory when omitted.",
|
|
1359
|
+
},
|
|
1360
|
+
output_dir: {
|
|
1361
|
+
type: "string",
|
|
1362
|
+
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
925
1363
|
},
|
|
926
1364
|
format: {
|
|
927
1365
|
type: "string",
|
|
@@ -962,6 +1400,25 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
962
1400
|
properties: {},
|
|
963
1401
|
},
|
|
964
1402
|
},
|
|
1403
|
+
{
|
|
1404
|
+
name: "vesper_convert_format",
|
|
1405
|
+
description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
|
|
1406
|
+
inputSchema: {
|
|
1407
|
+
type: "object",
|
|
1408
|
+
properties: {
|
|
1409
|
+
file_path: {
|
|
1410
|
+
type: "string",
|
|
1411
|
+
description: "Absolute path to the input dataset file.",
|
|
1412
|
+
},
|
|
1413
|
+
target_format: {
|
|
1414
|
+
type: "string",
|
|
1415
|
+
enum: ["csv", "parquet", "json", "jsonl"],
|
|
1416
|
+
description: "The desired output format.",
|
|
1417
|
+
},
|
|
1418
|
+
},
|
|
1419
|
+
required: ["file_path", "target_format"],
|
|
1420
|
+
},
|
|
1421
|
+
},
|
|
965
1422
|
{
|
|
966
1423
|
name: "fuse_datasets",
|
|
967
1424
|
description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
|
|
@@ -1069,912 +1526,1112 @@ server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
|
1069
1526
|
],
|
|
1070
1527
|
};
|
|
1071
1528
|
});
|
|
1072
|
-
// Call Tool
|
|
1529
|
+
// Call Tool — all requests are serialized through a queue to prevent crashes from parallel calls
|
|
1073
1530
|
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1531
|
+
return requestQueue.enqueue(async () => {
|
|
1532
|
+
// --- Pipeline Enforcement ---
|
|
1533
|
+
// Map tool names to pipeline steps
|
|
1534
|
+
const toolToStep = {
|
|
1535
|
+
vesper_search: "search",
|
|
1536
|
+
vesper_download: "download",
|
|
1537
|
+
vesper_analyze: "analyze",
|
|
1538
|
+
vesper_clean: "clean",
|
|
1539
|
+
vesper_split: "split",
|
|
1540
|
+
vesper_export: "export",
|
|
1541
|
+
prepare_dataset: "prepare",
|
|
1542
|
+
};
|
|
1543
|
+
// Extract dataset_id if present and normalize
|
|
1544
|
+
let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
|
|
1545
|
+
if (datasetId)
|
|
1546
|
+
datasetId = parseDatasetId(String(datasetId));
|
|
1547
|
+
// Pipeline rules
|
|
1548
|
+
const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
|
|
1549
|
+
const prereqs = {
|
|
1550
|
+
vesper_download: ["search"],
|
|
1551
|
+
vesper_analyze: ["download"],
|
|
1552
|
+
vesper_clean: ["analyze"],
|
|
1553
|
+
vesper_split: ["clean"],
|
|
1554
|
+
vesper_export: ["split"],
|
|
1555
|
+
};
|
|
1556
|
+
const tool = String(request.params.name);
|
|
1557
|
+
const step = toolToStep[tool];
|
|
1558
|
+
if (step && datasetId) {
|
|
1559
|
+
// Check prerequisites
|
|
1560
|
+
const required = prereqs[tool] || [];
|
|
1561
|
+
for (const req of required) {
|
|
1562
|
+
if (!hasStep(String(datasetId), req)) {
|
|
1563
|
+
// Auto-run missing step if possible, else error
|
|
1564
|
+
// For export, auto-run prepare_dataset if split missing
|
|
1565
|
+
if (tool === "vesper_export" && req === "split") {
|
|
1566
|
+
// Auto-trigger prepare_dataset (start a background prepare job)
|
|
1567
|
+
try {
|
|
1568
|
+
jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false, outputDir: process.cwd() });
|
|
1569
|
+
// Mark split as complete so export can proceed; export handler will also wait for data if needed.
|
|
1570
|
+
markStepComplete(String(datasetId), "split");
|
|
1571
|
+
}
|
|
1572
|
+
catch (e) {
|
|
1573
|
+
console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
|
|
1574
|
+
return {
|
|
1575
|
+
content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
|
|
1576
|
+
isError: true,
|
|
1577
|
+
};
|
|
1578
|
+
}
|
|
1113
1579
|
}
|
|
1114
|
-
|
|
1115
|
-
console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
|
|
1580
|
+
else {
|
|
1116
1581
|
return {
|
|
1117
|
-
content: [{ type: "text", text: `ERROR:
|
|
1582
|
+
content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
|
|
1118
1583
|
isError: true,
|
|
1119
1584
|
};
|
|
1120
1585
|
}
|
|
1121
1586
|
}
|
|
1122
|
-
else {
|
|
1123
|
-
return {
|
|
1124
|
-
content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
|
|
1125
|
-
isError: true,
|
|
1126
|
-
};
|
|
1127
|
-
}
|
|
1128
|
-
}
|
|
1129
|
-
}
|
|
1130
|
-
// Mark this step as complete
|
|
1131
|
-
markStepComplete(String(datasetId), String(step));
|
|
1132
|
-
}
|
|
1133
|
-
switch (request.params.name) {
|
|
1134
|
-
case "vesper_search": {
|
|
1135
|
-
const query = String(request.params.arguments?.query);
|
|
1136
|
-
const limit = 5;
|
|
1137
|
-
const safeOnly = true; // Enable safe filter by default
|
|
1138
|
-
const enableJIT = request.params.arguments?.enable_jit === true;
|
|
1139
|
-
if (!query) {
|
|
1140
|
-
throw new McpError(ErrorCode.InvalidParams, "Query is required");
|
|
1141
1587
|
}
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
return {
|
|
1145
|
-
content: [
|
|
1146
|
-
{
|
|
1147
|
-
type: "text",
|
|
1148
|
-
text: formattedOutput,
|
|
1149
|
-
},
|
|
1150
|
-
],
|
|
1151
|
-
};
|
|
1588
|
+
// Mark this step as complete
|
|
1589
|
+
markStepComplete(String(datasetId), String(step));
|
|
1152
1590
|
}
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
let results = [];
|
|
1163
|
-
if (source === "kaggle") {
|
|
1164
|
-
if (!dataIngestor.hasKaggleCredentials()) {
|
|
1591
|
+
switch (request.params.name) {
|
|
1592
|
+
case "unified_dataset_api": {
|
|
1593
|
+
hydrateExternalKeys();
|
|
1594
|
+
const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
|
|
1595
|
+
const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
|
|
1596
|
+
const includeUnavailable = request.params.arguments?.include_unavailable === true;
|
|
1597
|
+
const publicOnly = request.params.arguments?.public_only !== false;
|
|
1598
|
+
try {
|
|
1599
|
+
if (operation === "providers") {
|
|
1165
1600
|
return {
|
|
1166
|
-
content: [{ type: "text", text:
|
|
1167
|
-
isError: true,
|
|
1601
|
+
content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
|
|
1168
1602
|
};
|
|
1169
1603
|
}
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1604
|
+
if (operation === "discover") {
|
|
1605
|
+
const query = String(request.params.arguments?.query || "").trim();
|
|
1606
|
+
if (!query) {
|
|
1607
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
|
|
1608
|
+
}
|
|
1609
|
+
const result = await unifiedDatasetGateway.discover({
|
|
1610
|
+
query,
|
|
1611
|
+
source,
|
|
1612
|
+
limit: Number(request.params.arguments?.limit || 10),
|
|
1613
|
+
publicOnly,
|
|
1614
|
+
});
|
|
1178
1615
|
return {
|
|
1179
|
-
content: [{ type: "text", text:
|
|
1180
|
-
isError: true,
|
|
1616
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1181
1617
|
};
|
|
1182
1618
|
}
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1619
|
+
if (operation === "download") {
|
|
1620
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1621
|
+
if (!datasetId) {
|
|
1622
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
|
|
1623
|
+
}
|
|
1624
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
1625
|
+
? String(request.params.arguments.target_dir).trim()
|
|
1626
|
+
: request.params.arguments?.output_dir
|
|
1627
|
+
? String(request.params.arguments.output_dir).trim()
|
|
1628
|
+
: "";
|
|
1629
|
+
const targetDir = requestedTargetDir || process.cwd();
|
|
1630
|
+
try {
|
|
1631
|
+
await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
|
|
1632
|
+
}
|
|
1633
|
+
catch {
|
|
1634
|
+
// best effort; non-HF providers do not require this
|
|
1635
|
+
}
|
|
1636
|
+
const result = await unifiedDatasetGateway.download({
|
|
1637
|
+
datasetId,
|
|
1638
|
+
source,
|
|
1639
|
+
targetDir,
|
|
1640
|
+
});
|
|
1641
|
+
try {
|
|
1642
|
+
upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
|
|
1643
|
+
}
|
|
1644
|
+
catch (e) {
|
|
1645
|
+
console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
|
|
1646
|
+
}
|
|
1647
|
+
return {
|
|
1648
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1649
|
+
};
|
|
1203
1650
|
}
|
|
1204
|
-
|
|
1205
|
-
|
|
1651
|
+
if (operation === "info") {
|
|
1652
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1653
|
+
if (!datasetId) {
|
|
1654
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
|
|
1655
|
+
}
|
|
1656
|
+
const result = await unifiedDatasetGateway.info({
|
|
1657
|
+
datasetId,
|
|
1658
|
+
source,
|
|
1659
|
+
publicOnly,
|
|
1660
|
+
});
|
|
1661
|
+
return {
|
|
1662
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1663
|
+
};
|
|
1206
1664
|
}
|
|
1665
|
+
throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
|
|
1666
|
+
}
|
|
1667
|
+
catch (error) {
|
|
1668
|
+
return {
|
|
1669
|
+
content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
|
|
1670
|
+
isError: true,
|
|
1671
|
+
};
|
|
1207
1672
|
}
|
|
1208
|
-
const formattedOutput = formatSearchResults(results.slice(0, limit));
|
|
1209
|
-
return {
|
|
1210
|
-
content: [{ type: "text", text: formattedOutput }]
|
|
1211
|
-
};
|
|
1212
|
-
}
|
|
1213
|
-
catch (error) {
|
|
1214
|
-
return {
|
|
1215
|
-
content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
|
|
1216
|
-
isError: true,
|
|
1217
|
-
};
|
|
1218
|
-
}
|
|
1219
|
-
}
|
|
1220
|
-
case "download_dataset": {
|
|
1221
|
-
hydrateExternalKeys();
|
|
1222
|
-
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1223
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1224
|
-
if (!datasetId) {
|
|
1225
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1226
|
-
}
|
|
1227
|
-
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
1228
|
-
return {
|
|
1229
|
-
content: [{ type: "text", text: `Kaggle requires API credentials. Use the configure_keys tool to set kaggle_username and kaggle_key, or switch to source='huggingface' which works without credentials.` }],
|
|
1230
|
-
isError: true,
|
|
1231
|
-
};
|
|
1232
1673
|
}
|
|
1233
|
-
|
|
1674
|
+
case "vesper_search": {
|
|
1675
|
+
const query = String(request.params.arguments?.query);
|
|
1676
|
+
const limit = 5;
|
|
1677
|
+
const safeOnly = true; // Enable safe filter by default
|
|
1678
|
+
const enableJIT = request.params.arguments?.enable_jit === true;
|
|
1679
|
+
if (!query) {
|
|
1680
|
+
throw new McpError(ErrorCode.InvalidParams, "Query is required");
|
|
1681
|
+
}
|
|
1682
|
+
const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
|
|
1683
|
+
const formattedOutput = formatSearchResults(results);
|
|
1234
1684
|
return {
|
|
1235
|
-
content: [
|
|
1236
|
-
|
|
1685
|
+
content: [
|
|
1686
|
+
{
|
|
1687
|
+
type: "text",
|
|
1688
|
+
text: formattedOutput,
|
|
1689
|
+
},
|
|
1690
|
+
],
|
|
1237
1691
|
};
|
|
1238
1692
|
}
|
|
1239
|
-
|
|
1240
|
-
|
|
1693
|
+
case "discover_datasets": {
|
|
1694
|
+
hydrateExternalKeys();
|
|
1695
|
+
const query = String(request.params.arguments?.query || "").trim();
|
|
1696
|
+
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1697
|
+
const limit = Number(request.params.arguments?.limit || 10);
|
|
1698
|
+
if (!query) {
|
|
1699
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required");
|
|
1700
|
+
}
|
|
1241
1701
|
try {
|
|
1242
|
-
await
|
|
1243
|
-
|
|
1244
|
-
|
|
1702
|
+
const gatewayResult = await unifiedDatasetGateway.discover({
|
|
1703
|
+
query,
|
|
1704
|
+
source,
|
|
1705
|
+
limit,
|
|
1706
|
+
publicOnly: false,
|
|
1707
|
+
});
|
|
1708
|
+
const results = gatewayResult.results;
|
|
1709
|
+
const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
1710
|
+
for (const ds of results.slice(0, limit)) {
|
|
1711
|
+
const info = {
|
|
1712
|
+
dataset_id: ds.id,
|
|
1713
|
+
id: ds.id,
|
|
1714
|
+
source: ds.source,
|
|
1715
|
+
repo_id: ds.id,
|
|
1716
|
+
total_images: ds.total_examples || 0,
|
|
1717
|
+
image_column: undefined,
|
|
1718
|
+
recipes_dir: path.join(dataRoot, "recipes"),
|
|
1719
|
+
};
|
|
1720
|
+
try {
|
|
1721
|
+
await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
|
|
1722
|
+
}
|
|
1723
|
+
catch {
|
|
1724
|
+
// best-effort recipe generation; ignore discovery-time recipe failures
|
|
1725
|
+
}
|
|
1726
|
+
}
|
|
1727
|
+
const formattedOutput = formatSearchResults(results.slice(0, limit));
|
|
1728
|
+
const noteBlock = gatewayResult.notes.length > 0
|
|
1729
|
+
? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
|
|
1730
|
+
: "";
|
|
1731
|
+
return {
|
|
1732
|
+
content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
|
|
1733
|
+
};
|
|
1245
1734
|
}
|
|
1246
|
-
catch {
|
|
1247
|
-
|
|
1735
|
+
catch (error) {
|
|
1736
|
+
return {
|
|
1737
|
+
content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
|
|
1738
|
+
isError: true,
|
|
1739
|
+
};
|
|
1248
1740
|
}
|
|
1249
1741
|
}
|
|
1250
|
-
|
|
1251
|
-
|
|
1742
|
+
case "download_dataset": {
|
|
1743
|
+
hydrateExternalKeys();
|
|
1744
|
+
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1745
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1746
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
1747
|
+
? String(request.params.arguments.target_dir).trim()
|
|
1748
|
+
: request.params.arguments?.output_dir
|
|
1749
|
+
? String(request.params.arguments.output_dir).trim()
|
|
1750
|
+
: "";
|
|
1751
|
+
const targetDir = requestedTargetDir || process.cwd();
|
|
1752
|
+
if (!datasetId) {
|
|
1753
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1754
|
+
}
|
|
1755
|
+
// Pre-install Python datasets library for HuggingFace fallback
|
|
1756
|
+
if (source === "huggingface") {
|
|
1757
|
+
try {
|
|
1758
|
+
await ensurePythonModules([
|
|
1759
|
+
{ module: "datasets", packageName: "datasets" },
|
|
1760
|
+
]);
|
|
1761
|
+
}
|
|
1762
|
+
catch {
|
|
1763
|
+
// Continue - direct download may still work
|
|
1764
|
+
}
|
|
1765
|
+
}
|
|
1252
1766
|
try {
|
|
1253
|
-
|
|
1767
|
+
const result = await unifiedDatasetGateway.download({
|
|
1768
|
+
datasetId,
|
|
1769
|
+
source,
|
|
1770
|
+
targetDir,
|
|
1771
|
+
});
|
|
1772
|
+
try {
|
|
1773
|
+
upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
|
|
1774
|
+
}
|
|
1775
|
+
catch (e) {
|
|
1776
|
+
console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
|
|
1777
|
+
}
|
|
1778
|
+
const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
|
|
1779
|
+
return {
|
|
1780
|
+
content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
|
|
1781
|
+
};
|
|
1254
1782
|
}
|
|
1255
|
-
catch (
|
|
1256
|
-
|
|
1783
|
+
catch (error) {
|
|
1784
|
+
return {
|
|
1785
|
+
content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
|
|
1786
|
+
isError: true,
|
|
1787
|
+
};
|
|
1257
1788
|
}
|
|
1258
|
-
return {
|
|
1259
|
-
content: [{ type: "text", text: `Download complete: ${localPath}` }]
|
|
1260
|
-
};
|
|
1261
|
-
}
|
|
1262
|
-
catch (error) {
|
|
1263
|
-
return {
|
|
1264
|
-
content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
|
|
1265
|
-
isError: true,
|
|
1266
|
-
};
|
|
1267
|
-
}
|
|
1268
|
-
}
|
|
1269
|
-
case "vesper_download_assets": {
|
|
1270
|
-
hydrateExternalKeys();
|
|
1271
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1272
|
-
const source = String(request.params.arguments?.source || "").trim().toLowerCase();
|
|
1273
|
-
const repoId = request.params.arguments?.repo_id ? String(request.params.arguments.repo_id) : undefined;
|
|
1274
|
-
const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
|
|
1275
|
-
const urls = Array.isArray(request.params.arguments?.urls)
|
|
1276
|
-
? (request.params.arguments?.urls).map(v => String(v))
|
|
1277
|
-
: undefined;
|
|
1278
|
-
const outputFormat = String(request.params.arguments?.output_format || "webdataset");
|
|
1279
|
-
const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
|
|
1280
|
-
const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
|
|
1281
|
-
const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
|
|
1282
|
-
if (!datasetId || !source) {
|
|
1283
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
|
|
1284
|
-
}
|
|
1285
|
-
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
1286
|
-
return {
|
|
1287
|
-
content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
|
|
1288
|
-
isError: true,
|
|
1289
|
-
};
|
|
1290
|
-
}
|
|
1291
|
-
const requiredModules = [
|
|
1292
|
-
{ module: "aiohttp", packageName: "aiohttp" },
|
|
1293
|
-
];
|
|
1294
|
-
if (source === "url") {
|
|
1295
|
-
requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
|
|
1296
1789
|
}
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
image_column: imageColumn,
|
|
1323
|
-
output_root: path.join(dataRoot, "data", "assets"),
|
|
1324
|
-
recipes_dir: path.join(dataRoot, "recipes"),
|
|
1325
|
-
};
|
|
1326
|
-
try {
|
|
1327
|
-
const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
|
|
1328
|
-
if (!result?.ok) {
|
|
1790
|
+
case "vesper_download_assets": {
|
|
1791
|
+
hydrateExternalKeys();
|
|
1792
|
+
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1793
|
+
const source = String(request.params.arguments?.source || "").trim().toLowerCase();
|
|
1794
|
+
// Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
|
|
1795
|
+
const repoId = request.params.arguments?.repo_id
|
|
1796
|
+
? String(request.params.arguments.repo_id)
|
|
1797
|
+
: (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
|
|
1798
|
+
const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
|
|
1799
|
+
const urls = Array.isArray(request.params.arguments?.urls)
|
|
1800
|
+
? (request.params.arguments?.urls).map(v => String(v))
|
|
1801
|
+
: undefined;
|
|
1802
|
+
const outputFormat = String(request.params.arguments?.output_format || "webdataset");
|
|
1803
|
+
const requestedOutputDir = request.params.arguments?.target_dir
|
|
1804
|
+
? String(request.params.arguments.target_dir).trim()
|
|
1805
|
+
: request.params.arguments?.output_dir
|
|
1806
|
+
? String(request.params.arguments.output_dir).trim()
|
|
1807
|
+
: undefined;
|
|
1808
|
+
const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
|
|
1809
|
+
const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
|
|
1810
|
+
const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
|
|
1811
|
+
if (!datasetId || !source) {
|
|
1812
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
|
|
1813
|
+
}
|
|
1814
|
+
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
1329
1815
|
return {
|
|
1330
|
-
content: [{ type: "text", text:
|
|
1816
|
+
content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
|
|
1331
1817
|
isError: true,
|
|
1332
1818
|
};
|
|
1333
1819
|
}
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
return {
|
|
1340
|
-
content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
|
|
1341
|
-
isError: true,
|
|
1342
|
-
};
|
|
1343
|
-
}
|
|
1344
|
-
}
|
|
1345
|
-
case "configure_kaggle": {
|
|
1346
|
-
const username = String(request.params.arguments?.username || "").trim();
|
|
1347
|
-
const key = String(request.params.arguments?.key || "").trim();
|
|
1348
|
-
if (!username || !key) {
|
|
1349
|
-
throw new McpError(ErrorCode.InvalidParams, "username and key are required");
|
|
1350
|
-
}
|
|
1351
|
-
const r1 = secureKeys.set("kaggle_username", username);
|
|
1352
|
-
const r2 = secureKeys.set("kaggle_key", key);
|
|
1353
|
-
process.env.KAGGLE_USERNAME = username;
|
|
1354
|
-
process.env.KAGGLE_KEY = key;
|
|
1355
|
-
return {
|
|
1356
|
-
content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
|
|
1357
|
-
};
|
|
1358
|
-
}
|
|
1359
|
-
case "configure_keys": {
|
|
1360
|
-
const hfToken = String(request.params.arguments?.hf_token || "").trim();
|
|
1361
|
-
const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
|
|
1362
|
-
const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
|
|
1363
|
-
const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
|
|
1364
|
-
const saved = [];
|
|
1365
|
-
const methods = [];
|
|
1366
|
-
if (hfToken) {
|
|
1367
|
-
const r = secureKeys.set("hf_token", hfToken);
|
|
1368
|
-
if (r.ok) {
|
|
1369
|
-
process.env.HF_TOKEN = hfToken;
|
|
1370
|
-
saved.push("HF token");
|
|
1371
|
-
if (r.method)
|
|
1372
|
-
methods.push(r.method);
|
|
1820
|
+
const requiredModules = [
|
|
1821
|
+
{ module: "aiohttp", packageName: "aiohttp" },
|
|
1822
|
+
];
|
|
1823
|
+
if (source === "url") {
|
|
1824
|
+
requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
|
|
1373
1825
|
}
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
if (r.ok) {
|
|
1378
|
-
process.env.KAGGLE_USERNAME = kaggleUsername;
|
|
1379
|
-
saved.push("Kaggle username");
|
|
1380
|
-
if (r.method)
|
|
1381
|
-
methods.push(r.method);
|
|
1826
|
+
if (source === "huggingface") {
|
|
1827
|
+
requiredModules.push({ module: "datasets", packageName: "datasets" });
|
|
1828
|
+
requiredModules.push({ module: "PIL", packageName: "Pillow" });
|
|
1382
1829
|
}
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
const r = secureKeys.set("kaggle_key", kaggleKey);
|
|
1386
|
-
if (r.ok) {
|
|
1387
|
-
process.env.KAGGLE_KEY = kaggleKey;
|
|
1388
|
-
saved.push("Kaggle key");
|
|
1389
|
-
if (r.method)
|
|
1390
|
-
methods.push(r.method);
|
|
1830
|
+
if (source === "kaggle") {
|
|
1831
|
+
requiredModules.push({ module: "kaggle", packageName: "kaggle" });
|
|
1391
1832
|
}
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
const r = secureKeys.set("dataworld_token", dataworldToken);
|
|
1395
|
-
if (r.ok) {
|
|
1396
|
-
process.env.DW_AUTH_TOKEN = dataworldToken;
|
|
1397
|
-
saved.push("data.world token");
|
|
1398
|
-
if (r.method)
|
|
1399
|
-
methods.push(r.method);
|
|
1833
|
+
try {
|
|
1834
|
+
await ensurePythonModules(requiredModules);
|
|
1400
1835
|
}
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1836
|
+
catch (error) {
|
|
1837
|
+
return {
|
|
1838
|
+
content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
|
|
1839
|
+
isError: true,
|
|
1840
|
+
};
|
|
1841
|
+
}
|
|
1842
|
+
const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
1843
|
+
const payload = {
|
|
1844
|
+
dataset_id: datasetId,
|
|
1845
|
+
source,
|
|
1846
|
+
repo_id: repoId,
|
|
1847
|
+
kaggle_ref: kaggleRef,
|
|
1848
|
+
urls,
|
|
1849
|
+
output_format: outputFormat,
|
|
1850
|
+
output_dir: requestedOutputDir,
|
|
1851
|
+
max_items: maxItems,
|
|
1852
|
+
workers,
|
|
1853
|
+
image_column: imageColumn,
|
|
1854
|
+
output_root: requestedOutputDir || process.cwd(),
|
|
1855
|
+
recipes_dir: path.join(dataRoot, "recipes"),
|
|
1405
1856
|
};
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
const formattedOutput = formatDatasetInfo(dataset);
|
|
1424
|
-
return { content: [{ type: "text", text: formattedOutput }] };
|
|
1425
|
-
}
|
|
1426
|
-
case "analyze_quality": {
|
|
1427
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1428
|
-
const safeId = datasetId.replace(/\//g, "_");
|
|
1429
|
-
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
1430
|
-
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
1431
|
-
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
1432
|
-
// Demo Fallback for easy testing
|
|
1433
|
-
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
1434
|
-
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
1435
|
-
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
1436
|
-
if (fs.existsSync(demoParquetPath)) {
|
|
1437
|
-
filePath = demoParquetPath;
|
|
1438
|
-
}
|
|
1439
|
-
else if (fs.existsSync(demoCsvPath)) {
|
|
1440
|
-
filePath = demoCsvPath;
|
|
1441
|
-
}
|
|
1442
|
-
else if (datasetId !== "demo") {
|
|
1857
|
+
try {
|
|
1858
|
+
const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
|
|
1859
|
+
if (!result?.ok) {
|
|
1860
|
+
const errMsg = result?.error || "Unknown error";
|
|
1861
|
+
// Enhance error messages for common failures
|
|
1862
|
+
let hint = "";
|
|
1863
|
+
if (errMsg.includes("No image column")) {
|
|
1864
|
+
hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
|
|
1865
|
+
}
|
|
1866
|
+
else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
|
|
1867
|
+
hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
|
|
1868
|
+
}
|
|
1869
|
+
return {
|
|
1870
|
+
content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
|
|
1871
|
+
isError: true,
|
|
1872
|
+
};
|
|
1873
|
+
}
|
|
1443
1874
|
return {
|
|
1444
|
-
content: [{ type: "text", text:
|
|
1445
|
-
isError: true
|
|
1875
|
+
content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
|
|
1446
1876
|
};
|
|
1447
1877
|
}
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
}
|
|
1454
|
-
case "preview_cleaning": {
|
|
1455
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1456
|
-
const safeId = datasetId.replace(/\//g, "_");
|
|
1457
|
-
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
1458
|
-
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
1459
|
-
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
1460
|
-
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
1461
|
-
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
1462
|
-
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
1463
|
-
if (fs.existsSync(demoParquetPath)) {
|
|
1464
|
-
filePath = demoParquetPath;
|
|
1465
|
-
}
|
|
1466
|
-
else if (fs.existsSync(demoCsvPath)) {
|
|
1467
|
-
filePath = demoCsvPath;
|
|
1468
|
-
}
|
|
1469
|
-
else {
|
|
1470
|
-
throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
|
|
1878
|
+
catch (error) {
|
|
1879
|
+
return {
|
|
1880
|
+
content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
|
|
1881
|
+
isError: true,
|
|
1882
|
+
};
|
|
1471
1883
|
}
|
|
1472
1884
|
}
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
// Better approach: Instantiate TargetDetector here in index.ts for the tool content.
|
|
1484
|
-
// Quick fix: Instantiate local detector or make global.
|
|
1485
|
-
// I'll make a global `targetDetector` constant in index.ts
|
|
1486
|
-
// But wait, I updated `CleaningPlanner` to instantiate its own detector.
|
|
1487
|
-
// Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
|
|
1488
|
-
// RETRY STRATEGY:
|
|
1489
|
-
// 1. Instantiate `targetDetector` in `index.ts`.
|
|
1490
|
-
// 2. Run `detectTarget(filePath)`.
|
|
1491
|
-
// 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
|
|
1492
|
-
// I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
|
|
1493
|
-
// But since I'm in this tool, I can't look back.
|
|
1494
|
-
// I will assume I can add it, or just do it inside the case for now.
|
|
1495
|
-
// To do it properly, I should have added `targetDetector` to the global scope in previous step.
|
|
1496
|
-
// Let's do that in a separate step if needed.
|
|
1497
|
-
// For now, I'll instantiate it here.
|
|
1498
|
-
const { TargetDetector } = await import("./preparation/target-detector.js");
|
|
1499
|
-
const detector = new TargetDetector(__dirname);
|
|
1500
|
-
const targetResult = await detector.detectTarget(filePath);
|
|
1501
|
-
const targetInfo = targetResult.target_column ? {
|
|
1502
|
-
target: targetResult.target_column,
|
|
1503
|
-
confidence: targetResult.confidence
|
|
1504
|
-
} : undefined;
|
|
1505
|
-
const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
|
|
1506
|
-
let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
|
|
1507
|
-
if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
|
|
1508
|
-
explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
|
|
1509
|
-
explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
|
|
1510
|
-
}
|
|
1511
|
-
explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
|
|
1512
|
-
if (plan.operations.length === 0) {
|
|
1513
|
-
explanation += "No cleaning operations required.";
|
|
1514
|
-
}
|
|
1515
|
-
else {
|
|
1516
|
-
plan.operations.forEach((op, i) => {
|
|
1517
|
-
explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
|
|
1518
|
-
});
|
|
1519
|
-
}
|
|
1520
|
-
return {
|
|
1521
|
-
content: [{ type: "text", text: explanation }]
|
|
1522
|
-
};
|
|
1523
|
-
}
|
|
1524
|
-
case "custom_clean": {
|
|
1525
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1526
|
-
const ops = request.params.arguments?.operations;
|
|
1527
|
-
if (!datasetId || datasetId === "undefined") {
|
|
1528
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1529
|
-
}
|
|
1530
|
-
if (!ops || !Array.isArray(ops) || ops.length === 0) {
|
|
1531
|
-
throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
|
|
1532
|
-
}
|
|
1533
|
-
// Pre-check: verify dataset file exists before starting the job
|
|
1534
|
-
const cleanRegEntry = getRegistryEntry(datasetId);
|
|
1535
|
-
const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
|
|
1536
|
-
const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
|
|
1537
|
-
const cleanSafeId = datasetId.replace(/\//g, "_");
|
|
1538
|
-
const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
|
|
1539
|
-
(cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
|
|
1540
|
-
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
|
|
1541
|
-
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
|
|
1542
|
-
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
|
|
1543
|
-
fs.existsSync(datasetId);
|
|
1544
|
-
if (!cleanDataExists) {
|
|
1545
|
-
return {
|
|
1546
|
-
content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
|
|
1547
|
-
isError: true,
|
|
1548
|
-
};
|
|
1549
|
-
}
|
|
1550
|
-
const job = jobManager.createJob("clean", 0, { datasetId, ops });
|
|
1551
|
-
return {
|
|
1552
|
-
content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
|
|
1553
|
-
};
|
|
1554
|
-
}
|
|
1555
|
-
case "prepare_dataset": {
|
|
1556
|
-
hydrateExternalKeys();
|
|
1557
|
-
const query = String(request.params.arguments?.query);
|
|
1558
|
-
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
1559
|
-
const downloadImages = request.params.arguments?.download_images === true;
|
|
1560
|
-
if (!query || query === "undefined") {
|
|
1561
|
-
throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
|
|
1562
|
-
}
|
|
1563
|
-
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages });
|
|
1564
|
-
return {
|
|
1565
|
-
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
|
|
1566
|
-
};
|
|
1567
|
-
}
|
|
1568
|
-
case "compare_datasets": {
|
|
1569
|
-
const datasetIds = request.params.arguments?.dataset_ids;
|
|
1570
|
-
const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
|
|
1571
|
-
let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
|
|
1572
|
-
comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
|
|
1573
|
-
comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
|
|
1574
|
-
comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
|
|
1575
|
-
comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
|
|
1576
|
-
comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
|
|
1577
|
-
return {
|
|
1578
|
-
content: [{ type: "text", text: comparison }]
|
|
1579
|
-
};
|
|
1580
|
-
}
|
|
1581
|
-
case "check_job_status": {
|
|
1582
|
-
const jobId = String(request.params.arguments?.job_id);
|
|
1583
|
-
const job = metadataStore.getJob(jobId);
|
|
1584
|
-
if (!job) {
|
|
1585
|
-
throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
|
|
1586
|
-
}
|
|
1587
|
-
const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
|
|
1588
|
-
const now = Date.now();
|
|
1589
|
-
const last = jobStatusLastPoll[jobId] || 0;
|
|
1590
|
-
const minPollMs = 3000;
|
|
1591
|
-
if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
|
|
1592
|
-
const waitMs = minPollMs - (now - last);
|
|
1885
|
+
case "configure_kaggle": {
|
|
1886
|
+
const username = String(request.params.arguments?.username || "").trim();
|
|
1887
|
+
const key = String(request.params.arguments?.key || "").trim();
|
|
1888
|
+
if (!username || !key) {
|
|
1889
|
+
throw new McpError(ErrorCode.InvalidParams, "username and key are required");
|
|
1890
|
+
}
|
|
1891
|
+
const r1 = secureKeys.set("kaggle_username", username);
|
|
1892
|
+
const r2 = secureKeys.set("kaggle_key", key);
|
|
1893
|
+
process.env.KAGGLE_USERNAME = username;
|
|
1894
|
+
process.env.KAGGLE_KEY = key;
|
|
1593
1895
|
return {
|
|
1594
|
-
content: [{ type: "text", text: `
|
|
1896
|
+
content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
|
|
1595
1897
|
};
|
|
1596
1898
|
}
|
|
1597
|
-
|
|
1598
|
-
|
|
1599
|
-
|
|
1600
|
-
|
|
1601
|
-
|
|
1602
|
-
|
|
1603
|
-
|
|
1604
|
-
|
|
1605
|
-
|
|
1606
|
-
|
|
1607
|
-
|
|
1608
|
-
|
|
1609
|
-
|
|
1610
|
-
|
|
1611
|
-
|
|
1612
|
-
if (!dataset) {
|
|
1613
|
-
throw new McpError(ErrorCode.InvalidParams, `Dataset not found: ${datasetId}`);
|
|
1614
|
-
}
|
|
1615
|
-
// Use Metadata or Registry to find the actual local file
|
|
1616
|
-
let sourcePath = undefined;
|
|
1617
|
-
const downloadStatus = metadataStore.getDownloadStatus(datasetId);
|
|
1618
|
-
if (downloadStatus && fs.existsSync(downloadStatus.local_path)) {
|
|
1619
|
-
sourcePath = downloadStatus.local_path;
|
|
1620
|
-
}
|
|
1621
|
-
else {
|
|
1622
|
-
// Fallback to local registry
|
|
1623
|
-
const reg = getRegistryEntry(datasetId);
|
|
1624
|
-
if (reg && fs.existsSync(reg.local_path)) {
|
|
1625
|
-
sourcePath = reg.local_path;
|
|
1899
|
+
case "configure_keys": {
|
|
1900
|
+
const hfToken = String(request.params.arguments?.hf_token || "").trim();
|
|
1901
|
+
const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
|
|
1902
|
+
const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
|
|
1903
|
+
const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
|
|
1904
|
+
const saved = [];
|
|
1905
|
+
const methods = [];
|
|
1906
|
+
if (hfToken) {
|
|
1907
|
+
const r = secureKeys.set("hf_token", hfToken);
|
|
1908
|
+
if (r.ok) {
|
|
1909
|
+
process.env.HF_TOKEN = hfToken;
|
|
1910
|
+
saved.push("HF token");
|
|
1911
|
+
if (r.method)
|
|
1912
|
+
methods.push(r.method);
|
|
1913
|
+
}
|
|
1626
1914
|
}
|
|
1627
|
-
|
|
1628
|
-
|
|
1629
|
-
|
|
1630
|
-
|
|
1631
|
-
|
|
1632
|
-
|
|
1915
|
+
if (kaggleUsername) {
|
|
1916
|
+
const r = secureKeys.set("kaggle_username", kaggleUsername);
|
|
1917
|
+
if (r.ok) {
|
|
1918
|
+
process.env.KAGGLE_USERNAME = kaggleUsername;
|
|
1919
|
+
saved.push("Kaggle username");
|
|
1920
|
+
if (r.method)
|
|
1921
|
+
methods.push(r.method);
|
|
1922
|
+
}
|
|
1633
1923
|
}
|
|
1634
|
-
|
|
1635
|
-
|
|
1636
|
-
|
|
1637
|
-
|
|
1638
|
-
|
|
1639
|
-
|
|
1640
|
-
|
|
1641
|
-
let waited = 0;
|
|
1642
|
-
while (waited < maxWait) {
|
|
1643
|
-
const ds = metadataStore.getDownloadStatus(datasetId);
|
|
1644
|
-
if (ds && ds.local_path && fs.existsSync(ds.local_path)) {
|
|
1645
|
-
sourcePath = ds.local_path;
|
|
1646
|
-
console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
|
|
1647
|
-
break;
|
|
1924
|
+
if (kaggleKey) {
|
|
1925
|
+
const r = secureKeys.set("kaggle_key", kaggleKey);
|
|
1926
|
+
if (r.ok) {
|
|
1927
|
+
process.env.KAGGLE_KEY = kaggleKey;
|
|
1928
|
+
saved.push("Kaggle key");
|
|
1929
|
+
if (r.method)
|
|
1930
|
+
methods.push(r.method);
|
|
1648
1931
|
}
|
|
1649
|
-
|
|
1650
|
-
|
|
1651
|
-
|
|
1652
|
-
|
|
1653
|
-
|
|
1654
|
-
|
|
1932
|
+
}
|
|
1933
|
+
if (dataworldToken) {
|
|
1934
|
+
const r = secureKeys.set("dataworld_token", dataworldToken);
|
|
1935
|
+
if (r.ok) {
|
|
1936
|
+
process.env.DW_AUTH_TOKEN = dataworldToken;
|
|
1937
|
+
saved.push("data.world token");
|
|
1938
|
+
if (r.method)
|
|
1939
|
+
methods.push(r.method);
|
|
1655
1940
|
}
|
|
1656
|
-
await wait(interval);
|
|
1657
|
-
waited += interval;
|
|
1658
1941
|
}
|
|
1659
|
-
|
|
1660
|
-
if (!sourcePath) {
|
|
1661
|
-
const entries = readRegistry();
|
|
1662
|
-
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
|
|
1942
|
+
if (saved.length === 0) {
|
|
1663
1943
|
return {
|
|
1664
|
-
content: [{ type: "text", text:
|
|
1665
|
-
isError: true
|
|
1944
|
+
content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
|
|
1666
1945
|
};
|
|
1667
1946
|
}
|
|
1947
|
+
return {
|
|
1948
|
+
content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
|
|
1949
|
+
};
|
|
1668
1950
|
}
|
|
1669
|
-
|
|
1670
|
-
|
|
1671
|
-
|
|
1672
|
-
|
|
1673
|
-
|
|
1674
|
-
|
|
1675
|
-
|
|
1676
|
-
|
|
1677
|
-
|
|
1678
|
-
|
|
1951
|
+
case "get_dataset_info": {
|
|
1952
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1953
|
+
if (!datasetId) {
|
|
1954
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1955
|
+
}
|
|
1956
|
+
const dataset = metadataStore.getDataset(datasetId);
|
|
1957
|
+
if (!dataset) {
|
|
1958
|
+
// Fallback: check the registry for local path info
|
|
1959
|
+
const regEntry = getRegistryEntry(datasetId);
|
|
1960
|
+
const regPath = regEntry?.local_path || regEntry?.path;
|
|
1961
|
+
if (regEntry) {
|
|
1962
|
+
const exists = regPath && fs.existsSync(regPath);
|
|
1963
|
+
return {
|
|
1964
|
+
content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
|
|
1965
|
+
};
|
|
1966
|
+
}
|
|
1967
|
+
return {
|
|
1968
|
+
content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
|
|
1969
|
+
isError: true,
|
|
1970
|
+
};
|
|
1971
|
+
}
|
|
1972
|
+
// Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
|
|
1973
|
+
if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
|
|
1679
1974
|
try {
|
|
1680
|
-
const
|
|
1681
|
-
if (
|
|
1682
|
-
|
|
1683
|
-
|
|
1684
|
-
|
|
1685
|
-
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
|
|
1975
|
+
const sizeResp = await fetch(`https://datasets-server.huggingface.co/size?dataset=${encodeURIComponent(dataset.id)}`);
|
|
1976
|
+
if (sizeResp.ok) {
|
|
1977
|
+
const sizeData = await sizeResp.json();
|
|
1978
|
+
const numRows = sizeData?.size?.dataset?.num_rows;
|
|
1979
|
+
if (numRows && numRows > 0) {
|
|
1980
|
+
dataset.total_examples = numRows;
|
|
1981
|
+
// Also backfill splits
|
|
1982
|
+
if (sizeData?.size?.splits && Array.isArray(sizeData.size.splits)) {
|
|
1983
|
+
dataset.splits = sizeData.size.splits.map((s) => ({
|
|
1984
|
+
name: s.split,
|
|
1985
|
+
num_examples: s.num_rows || 0,
|
|
1986
|
+
size_bytes: s.num_bytes_parquet_files || 0,
|
|
1987
|
+
}));
|
|
1988
|
+
dataset.has_train_split = dataset.splits.some((s) => s.name === "train");
|
|
1989
|
+
dataset.has_test_split = dataset.splits.some((s) => s.name === "test");
|
|
1990
|
+
dataset.has_validation_split = dataset.splits.some((s) => s.name === "validation" || s.name === "val");
|
|
1991
|
+
}
|
|
1992
|
+
// Persist enriched metadata
|
|
1993
|
+
metadataStore.saveDataset(dataset);
|
|
1689
1994
|
}
|
|
1690
1995
|
}
|
|
1691
1996
|
}
|
|
1692
|
-
catch
|
|
1693
|
-
|
|
1997
|
+
catch {
|
|
1998
|
+
// Enrichment is best-effort; continue with whatever we have
|
|
1694
1999
|
}
|
|
1695
2000
|
}
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
|
|
1701
|
-
|
|
1702
|
-
|
|
1703
|
-
|
|
1704
|
-
|
|
1705
|
-
|
|
1706
|
-
|
|
1707
|
-
|
|
1708
|
-
|
|
1709
|
-
|
|
1710
|
-
|
|
1711
|
-
|
|
1712
|
-
|
|
1713
|
-
|
|
1714
|
-
|
|
1715
|
-
|
|
1716
|
-
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
-
|
|
1723
|
-
msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
|
|
1724
|
-
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
1725
|
-
if (result.file_size_mb !== undefined)
|
|
1726
|
-
msg += `- **Size**: ${result.file_size_mb} MB\n`;
|
|
1727
|
-
if (result.elapsed_seconds !== undefined)
|
|
1728
|
-
msg += `- **Time**: ${result.elapsed_seconds}s\n`;
|
|
1729
|
-
if (result.preview_path)
|
|
1730
|
-
msg += `- **Preview**: ${result.preview_path}\n`;
|
|
1731
|
-
msg += `\n`;
|
|
1732
|
-
if (requestedFormat === "feather") {
|
|
1733
|
-
msg += `**Inspect with:**\n`;
|
|
1734
|
-
msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
|
|
1735
|
-
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
1736
|
-
}
|
|
1737
|
-
else if (requestedFormat === "parquet") {
|
|
1738
|
-
msg += `**Inspect with:**\n`;
|
|
1739
|
-
msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
|
|
1740
|
-
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
1741
|
-
}
|
|
1742
|
-
return { content: [{ type: "text", text: msg }] };
|
|
1743
|
-
}
|
|
1744
|
-
catch (error) {
|
|
2001
|
+
const formattedOutput = formatDatasetInfo(dataset);
|
|
2002
|
+
return { content: [{ type: "text", text: formattedOutput }] };
|
|
2003
|
+
}
|
|
2004
|
+
case "analyze_quality": {
|
|
2005
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2006
|
+
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
2007
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
2008
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
2009
|
+
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
2010
|
+
// Demo Fallback for easy testing
|
|
2011
|
+
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
2012
|
+
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
2013
|
+
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
2014
|
+
if (fs.existsSync(demoParquetPath)) {
|
|
2015
|
+
filePath = demoParquetPath;
|
|
2016
|
+
}
|
|
2017
|
+
else if (fs.existsSync(demoCsvPath)) {
|
|
2018
|
+
filePath = demoCsvPath;
|
|
2019
|
+
}
|
|
2020
|
+
else if (datasetId !== "demo") {
|
|
2021
|
+
return {
|
|
2022
|
+
content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
|
|
2023
|
+
isError: true
|
|
2024
|
+
};
|
|
2025
|
+
}
|
|
2026
|
+
}
|
|
2027
|
+
const report = await qualityAnalyzer.analyze(filePath);
|
|
1745
2028
|
return {
|
|
1746
|
-
content: [{ type: "text", text:
|
|
1747
|
-
isError: true
|
|
2029
|
+
content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
|
|
1748
2030
|
};
|
|
1749
2031
|
}
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
1756
|
-
|
|
1757
|
-
|
|
1758
|
-
|
|
1759
|
-
|
|
1760
|
-
|
|
1761
|
-
|
|
1762
|
-
|
|
1763
|
-
|
|
1764
|
-
|
|
1765
|
-
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
if (fs.existsSync(src)) {
|
|
1769
|
-
resolvedPaths.push(src);
|
|
1770
|
-
continue;
|
|
2032
|
+
case "preview_cleaning": {
|
|
2033
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2034
|
+
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
2035
|
+
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
2036
|
+
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
2037
|
+
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
2038
|
+
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
2039
|
+
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
2040
|
+
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
2041
|
+
if (fs.existsSync(demoParquetPath)) {
|
|
2042
|
+
filePath = demoParquetPath;
|
|
2043
|
+
}
|
|
2044
|
+
else if (fs.existsSync(demoCsvPath)) {
|
|
2045
|
+
filePath = demoCsvPath;
|
|
2046
|
+
}
|
|
2047
|
+
else {
|
|
2048
|
+
throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
|
|
2049
|
+
}
|
|
1771
2050
|
}
|
|
1772
|
-
const
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
|
|
2051
|
+
const report = await qualityAnalyzer.analyze(filePath);
|
|
2052
|
+
// Phase 1: Target Detection
|
|
2053
|
+
// We use the same TargetDetector instance inside CleaningPlanner now?
|
|
2054
|
+
// Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
|
|
2055
|
+
// OR let the planner handle it if we update its signature to accept filePath.
|
|
2056
|
+
// Let's check `CleaningPlanner.generatePlan` signature again.
|
|
2057
|
+
// We updated it to accept `targetInfo`.
|
|
2058
|
+
// So we need to run detection HERE and pass it.
|
|
2059
|
+
// But `TargetDetector` is not exposed in `index.ts` scope yet.
|
|
2060
|
+
// Let's create a global instance or use the one inside planner if exposed (it's private).
|
|
2061
|
+
// Better approach: Instantiate TargetDetector here in index.ts for the tool content.
|
|
2062
|
+
// Quick fix: Instantiate local detector or make global.
|
|
2063
|
+
// I'll make a global `targetDetector` constant in index.ts
|
|
2064
|
+
// But wait, I updated `CleaningPlanner` to instantiate its own detector.
|
|
2065
|
+
// Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
|
|
2066
|
+
// RETRY STRATEGY:
|
|
2067
|
+
// 1. Instantiate `targetDetector` in `index.ts`.
|
|
2068
|
+
// 2. Run `detectTarget(filePath)`.
|
|
2069
|
+
// 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
|
|
2070
|
+
// I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
|
|
2071
|
+
// But since I'm in this tool, I can't look back.
|
|
2072
|
+
// I will assume I can add it, or just do it inside the case for now.
|
|
2073
|
+
// To do it properly, I should have added `targetDetector` to the global scope in previous step.
|
|
2074
|
+
// Let's do that in a separate step if needed.
|
|
2075
|
+
// For now, I'll instantiate it here.
|
|
2076
|
+
const { TargetDetector } = await import("./preparation/target-detector.js");
|
|
2077
|
+
const detector = new TargetDetector(__dirname);
|
|
2078
|
+
const targetResult = await detector.detectTarget(filePath);
|
|
2079
|
+
const targetInfo = targetResult.target_column ? {
|
|
2080
|
+
target: targetResult.target_column,
|
|
2081
|
+
confidence: targetResult.confidence
|
|
2082
|
+
} : undefined;
|
|
2083
|
+
const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
|
|
2084
|
+
let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
|
|
2085
|
+
if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
|
|
2086
|
+
explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
|
|
2087
|
+
explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
|
|
2088
|
+
}
|
|
2089
|
+
explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
|
|
2090
|
+
if (plan.operations.length === 0) {
|
|
2091
|
+
explanation += "No cleaning operations required.";
|
|
2092
|
+
}
|
|
2093
|
+
else {
|
|
2094
|
+
plan.operations.forEach((op, i) => {
|
|
2095
|
+
explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
|
|
2096
|
+
});
|
|
1776
2097
|
}
|
|
1777
|
-
unresolved.push(src);
|
|
1778
|
-
}
|
|
1779
|
-
if (unresolved.length > 0) {
|
|
1780
2098
|
return {
|
|
1781
|
-
content: [{
|
|
1782
|
-
type: "text",
|
|
1783
|
-
text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
|
|
1784
|
-
}],
|
|
1785
|
-
isError: true
|
|
2099
|
+
content: [{ type: "text", text: explanation }]
|
|
1786
2100
|
};
|
|
1787
2101
|
}
|
|
1788
|
-
|
|
1789
|
-
const
|
|
1790
|
-
const
|
|
1791
|
-
|
|
1792
|
-
|
|
1793
|
-
fs.mkdirSync(outDir, { recursive: true });
|
|
1794
|
-
const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
|
|
1795
|
-
const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
|
|
1796
|
-
strategy,
|
|
1797
|
-
join_on: joinOn,
|
|
1798
|
-
how,
|
|
1799
|
-
dedup,
|
|
1800
|
-
run_quality_after: runQualityAfter,
|
|
1801
|
-
leakage_check: leakageCheck,
|
|
1802
|
-
output_format: outputFormat,
|
|
1803
|
-
compression: compression,
|
|
1804
|
-
preview,
|
|
1805
|
-
});
|
|
1806
|
-
const nullDelta = result.stats.null_delta;
|
|
1807
|
-
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
1808
|
-
// Register fused dataset under a generated id so users can export it easily
|
|
1809
|
-
const fusedId = `fused_${Date.now()}`;
|
|
1810
|
-
try {
|
|
1811
|
-
upsertRegistry(fusedId, result.output_path, "completed");
|
|
2102
|
+
case "custom_clean": {
|
|
2103
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2104
|
+
const ops = request.params.arguments?.operations;
|
|
2105
|
+
if (!datasetId || datasetId === "undefined") {
|
|
2106
|
+
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1812
2107
|
}
|
|
1813
|
-
|
|
1814
|
-
|
|
1815
|
-
}
|
|
1816
|
-
let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
|
|
1817
|
-
msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
|
|
1818
|
-
msg += `- Null change: ${nullText}\n`;
|
|
1819
|
-
msg += `- Output: ${result.output_path}\n`;
|
|
1820
|
-
if (result.preview_path)
|
|
1821
|
-
msg += `- Preview: ${result.preview_path}\n`;
|
|
1822
|
-
if (result.leakage_report) {
|
|
1823
|
-
msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
|
|
1824
|
-
if (result.leakage_report.leakage_count) {
|
|
1825
|
-
msg += ` (${result.leakage_report.leakage_count})`;
|
|
1826
|
-
}
|
|
1827
|
-
msg += "\n";
|
|
2108
|
+
if (!ops || !Array.isArray(ops) || ops.length === 0) {
|
|
2109
|
+
throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
|
|
1828
2110
|
}
|
|
1829
|
-
|
|
1830
|
-
|
|
1831
|
-
|
|
1832
|
-
|
|
2111
|
+
// Pre-check: verify dataset file exists before starting the job
|
|
2112
|
+
const cleanRegEntry = getRegistryEntry(datasetId);
|
|
2113
|
+
const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
|
|
2114
|
+
const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
|
|
2115
|
+
const cleanSafeId = toSafeDatasetPathFragment(datasetId);
|
|
2116
|
+
const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
|
|
2117
|
+
(cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
|
|
2118
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
|
|
2119
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
|
|
2120
|
+
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
|
|
2121
|
+
fs.existsSync(datasetId);
|
|
2122
|
+
if (!cleanDataExists) {
|
|
2123
|
+
return {
|
|
2124
|
+
content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
|
|
2125
|
+
isError: true,
|
|
2126
|
+
};
|
|
2127
|
+
}
|
|
2128
|
+
const job = jobManager.createJob("clean", 0, { datasetId, ops });
|
|
1833
2129
|
return {
|
|
1834
|
-
content: [{ type: "text", text: `
|
|
1835
|
-
isError: true
|
|
2130
|
+
content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
|
|
1836
2131
|
};
|
|
1837
2132
|
}
|
|
1838
|
-
|
|
1839
|
-
|
|
1840
|
-
|
|
1841
|
-
|
|
1842
|
-
|
|
1843
|
-
|
|
1844
|
-
|
|
1845
|
-
|
|
1846
|
-
|
|
1847
|
-
|
|
1848
|
-
|
|
1849
|
-
|
|
1850
|
-
|
|
1851
|
-
if (report.individual_results.length > 0) {
|
|
1852
|
-
output += `### Sample Detail (Top 5)\n`;
|
|
1853
|
-
report.individual_results.slice(0, 5).forEach(img => {
|
|
1854
|
-
const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
|
|
1855
|
-
output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
|
|
1856
|
-
});
|
|
2133
|
+
case "prepare_dataset": {
|
|
2134
|
+
hydrateExternalKeys();
|
|
2135
|
+
const query = String(request.params.arguments?.query);
|
|
2136
|
+
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
2137
|
+
const downloadImages = request.params.arguments?.download_images === true;
|
|
2138
|
+
const requestedOutputDir = request.params.arguments?.target_dir
|
|
2139
|
+
? String(request.params.arguments.target_dir).trim()
|
|
2140
|
+
: request.params.arguments?.output_dir
|
|
2141
|
+
? String(request.params.arguments.output_dir).trim()
|
|
2142
|
+
: "";
|
|
2143
|
+
const outputDir = requestedOutputDir || process.cwd();
|
|
2144
|
+
if (!query || query === "undefined") {
|
|
2145
|
+
throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
|
|
1857
2146
|
}
|
|
2147
|
+
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
|
|
1858
2148
|
return {
|
|
1859
|
-
content: [{ type: "text", text:
|
|
2149
|
+
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
|
|
1860
2150
|
};
|
|
1861
2151
|
}
|
|
1862
|
-
|
|
2152
|
+
case "compare_datasets": {
|
|
2153
|
+
const datasetIds = request.params.arguments?.dataset_ids;
|
|
2154
|
+
const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
|
|
2155
|
+
let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
|
|
2156
|
+
comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
|
|
2157
|
+
comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
|
|
2158
|
+
comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
|
|
2159
|
+
comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
|
|
2160
|
+
comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
|
|
1863
2161
|
return {
|
|
1864
|
-
content: [{ type: "text", text:
|
|
1865
|
-
isError: true
|
|
2162
|
+
content: [{ type: "text", text: comparison }]
|
|
1866
2163
|
};
|
|
1867
2164
|
}
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
2165
|
+
case "check_job_status": {
|
|
2166
|
+
const jobId = String(request.params.arguments?.job_id);
|
|
2167
|
+
const job = metadataStore.getJob(jobId);
|
|
2168
|
+
if (!job) {
|
|
2169
|
+
throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
|
|
2170
|
+
}
|
|
2171
|
+
const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
|
|
2172
|
+
const now = Date.now();
|
|
2173
|
+
const last = jobStatusLastPoll[jobId] || 0;
|
|
2174
|
+
const minPollMs = 3000;
|
|
2175
|
+
if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
|
|
2176
|
+
const waitMs = minPollMs - (now - last);
|
|
2177
|
+
return {
|
|
2178
|
+
content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
|
|
2179
|
+
};
|
|
2180
|
+
}
|
|
2181
|
+
jobStatusLastPoll[jobId] = now;
|
|
2182
|
+
return {
|
|
2183
|
+
content: [{ type: "text", text: formatJobStatus(job) }]
|
|
2184
|
+
};
|
|
1873
2185
|
}
|
|
1874
|
-
|
|
1875
|
-
const
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
|
|
1879
|
-
|
|
1880
|
-
|
|
1881
|
-
|
|
1882
|
-
|
|
1883
|
-
|
|
1884
|
-
|
|
1885
|
-
|
|
1886
|
-
|
|
1887
|
-
|
|
1888
|
-
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
|
|
2186
|
+
case "export_dataset": {
|
|
2187
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2188
|
+
const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
|
|
2189
|
+
const requestedTargetDir = request.params.arguments?.target_dir
|
|
2190
|
+
? String(request.params.arguments?.target_dir).trim()
|
|
2191
|
+
: request.params.arguments?.output_dir
|
|
2192
|
+
? String(request.params.arguments?.output_dir).trim()
|
|
2193
|
+
: "";
|
|
2194
|
+
const targetDir = path.resolve(requestedTargetDir || process.cwd());
|
|
2195
|
+
const requestedFormat = String(request.params.arguments?.format || "feather");
|
|
2196
|
+
const fastMode = request.params.arguments?.fast === true;
|
|
2197
|
+
const preview = request.params.arguments?.preview === true;
|
|
2198
|
+
const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
|
|
2199
|
+
const columns = request.params.arguments?.columns;
|
|
2200
|
+
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
2201
|
+
// Use Metadata or Registry to find the actual local file
|
|
2202
|
+
const preferredLookupDirs = [targetDir, process.cwd()];
|
|
2203
|
+
let sourcePath = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
|
|
2204
|
+
if (!sourcePath) {
|
|
2205
|
+
console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
|
|
2206
|
+
// Start a prepare job for this dataset id (acts like calling prepare_dataset)
|
|
2207
|
+
try {
|
|
2208
|
+
jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
|
|
1892
2209
|
}
|
|
1893
|
-
|
|
1894
|
-
|
|
2210
|
+
catch (e) {
|
|
2211
|
+
console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
|
|
1895
2212
|
}
|
|
1896
|
-
|
|
1897
|
-
|
|
2213
|
+
// Poll for download status or registry entry until local_path appears or timeout
|
|
2214
|
+
const wait = (ms) => new Promise(res => setTimeout(res, ms));
|
|
2215
|
+
const maxWait = 120_000; // 120s
|
|
2216
|
+
const interval = 2000;
|
|
2217
|
+
let waited = 0;
|
|
2218
|
+
while (waited < maxWait) {
|
|
2219
|
+
const resolved = resolveDatasetLocalPath(datasetId, preferredLookupDirs);
|
|
2220
|
+
if (resolved) {
|
|
2221
|
+
sourcePath = resolved;
|
|
2222
|
+
console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
|
|
2223
|
+
break;
|
|
2224
|
+
}
|
|
2225
|
+
await wait(interval);
|
|
2226
|
+
waited += interval;
|
|
2227
|
+
}
|
|
2228
|
+
// If still no sourcePath, return helpful error listing prepared datasets
|
|
2229
|
+
if (!sourcePath) {
|
|
2230
|
+
const entries = readRegistry();
|
|
2231
|
+
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
|
|
2232
|
+
return {
|
|
2233
|
+
content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
|
|
2234
|
+
isError: true
|
|
2235
|
+
};
|
|
2236
|
+
}
|
|
2237
|
+
}
|
|
2238
|
+
sourcePath = ensureExportableLocalPath(sourcePath);
|
|
2239
|
+
try {
|
|
2240
|
+
if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
|
|
2241
|
+
upsertRegistry(datasetId, sourcePath, "completed");
|
|
1898
2242
|
}
|
|
2243
|
+
}
|
|
2244
|
+
catch (e) {
|
|
2245
|
+
console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
|
|
2246
|
+
}
|
|
2247
|
+
// If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
|
|
2248
|
+
if (!fastMode) {
|
|
2249
|
+
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
2250
|
+
const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
|
|
2251
|
+
const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
|
|
2252
|
+
if (!pipelineCompatibleInput) {
|
|
2253
|
+
console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
|
|
2254
|
+
}
|
|
2255
|
+
else if (currentExt !== pipelineFmt) {
|
|
2256
|
+
console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
|
|
2257
|
+
try {
|
|
2258
|
+
sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, targetDir);
|
|
2259
|
+
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
2260
|
+
if (pipelineResult.final_output_path) {
|
|
2261
|
+
sourcePath = pipelineResult.final_output_path;
|
|
2262
|
+
try {
|
|
2263
|
+
// Update registry to point to pipeline's final output
|
|
2264
|
+
if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
|
|
2265
|
+
upsertRegistry(datasetId, sourcePath, "completed");
|
|
2266
|
+
}
|
|
2267
|
+
}
|
|
2268
|
+
catch (e) {
|
|
2269
|
+
console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
|
|
2270
|
+
}
|
|
2271
|
+
}
|
|
2272
|
+
}
|
|
2273
|
+
catch (err) {
|
|
2274
|
+
console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
|
|
2275
|
+
}
|
|
2276
|
+
}
|
|
2277
|
+
}
|
|
2278
|
+
else {
|
|
2279
|
+
console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
|
|
2280
|
+
}
|
|
2281
|
+
// Build export options
|
|
2282
|
+
const exportOpts = {};
|
|
2283
|
+
if (compression)
|
|
2284
|
+
exportOpts.compression = compression;
|
|
2285
|
+
if (preview)
|
|
2286
|
+
exportOpts.preview = true;
|
|
2287
|
+
if (sampleRows)
|
|
2288
|
+
exportOpts.sample_rows = sampleRows;
|
|
2289
|
+
if (columns)
|
|
2290
|
+
exportOpts.columns = columns;
|
|
2291
|
+
try {
|
|
2292
|
+
// Determine output file name
|
|
2293
|
+
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
2294
|
+
const ext = extMap[requestedFormat] || ".feather";
|
|
2295
|
+
const safeName = getExportFileStem(datasetId);
|
|
2296
|
+
const outDir = targetDir;
|
|
2297
|
+
if (!fs.existsSync(outDir))
|
|
2298
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
2299
|
+
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
2300
|
+
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
2301
|
+
// Build rich response
|
|
2302
|
+
let msg = `**Export complete**\n`;
|
|
2303
|
+
msg += `- **File**: ${result.output_path}\n`;
|
|
2304
|
+
msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
|
|
2305
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
2306
|
+
if (result.file_size_mb !== undefined)
|
|
2307
|
+
msg += `- **Size**: ${result.file_size_mb} MB\n`;
|
|
2308
|
+
if (result.elapsed_seconds !== undefined)
|
|
2309
|
+
msg += `- **Time**: ${result.elapsed_seconds}s\n`;
|
|
2310
|
+
if (result.preview_path)
|
|
2311
|
+
msg += `- **Preview**: ${result.preview_path}\n`;
|
|
2312
|
+
msg += `\n`;
|
|
2313
|
+
if (requestedFormat === "feather") {
|
|
2314
|
+
msg += `**Inspect with:**\n`;
|
|
2315
|
+
msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
|
|
2316
|
+
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
2317
|
+
}
|
|
2318
|
+
else if (requestedFormat === "parquet") {
|
|
2319
|
+
msg += `**Inspect with:**\n`;
|
|
2320
|
+
msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
|
|
2321
|
+
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
2322
|
+
}
|
|
2323
|
+
return { content: [{ type: "text", text: msg }] };
|
|
2324
|
+
}
|
|
2325
|
+
catch (error) {
|
|
2326
|
+
return {
|
|
2327
|
+
content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
|
|
2328
|
+
isError: true
|
|
2329
|
+
};
|
|
2330
|
+
}
|
|
2331
|
+
}
|
|
2332
|
+
case "vesper_list_datasets": {
|
|
2333
|
+
const entries = readRegistry();
|
|
2334
|
+
if (entries.length === 0) {
|
|
2335
|
+
return {
|
|
2336
|
+
content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
|
|
2337
|
+
};
|
|
2338
|
+
}
|
|
2339
|
+
const lines = entries.map((e, i) => {
|
|
2340
|
+
const id = e.dataset_id || e.id || "unknown";
|
|
2341
|
+
const localPath = e.local_path || e.path || "unknown";
|
|
2342
|
+
const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
|
|
2343
|
+
return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
|
|
1899
2344
|
});
|
|
1900
2345
|
return {
|
|
1901
|
-
content: [{ type: "text", text:
|
|
2346
|
+
content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
|
|
1902
2347
|
};
|
|
1903
2348
|
}
|
|
1904
|
-
|
|
1905
|
-
|
|
1906
|
-
|
|
1907
|
-
|
|
1908
|
-
|
|
2349
|
+
case "vesper_convert_format": {
|
|
2350
|
+
const filePath = String(request.params.arguments?.file_path || "").trim();
|
|
2351
|
+
const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
|
|
2352
|
+
if (!filePath) {
|
|
2353
|
+
throw new McpError(ErrorCode.InvalidParams, "file_path is required");
|
|
2354
|
+
}
|
|
2355
|
+
if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
|
|
2356
|
+
throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
|
|
2357
|
+
}
|
|
2358
|
+
if (!fs.existsSync(filePath)) {
|
|
2359
|
+
return {
|
|
2360
|
+
content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
|
|
2361
|
+
isError: true,
|
|
2362
|
+
};
|
|
2363
|
+
}
|
|
2364
|
+
const inputExt = path.extname(filePath).toLowerCase();
|
|
2365
|
+
const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
|
|
2366
|
+
const outputExt = extMap[targetFormat];
|
|
2367
|
+
if (inputExt === outputExt) {
|
|
2368
|
+
return {
|
|
2369
|
+
content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
|
|
2370
|
+
};
|
|
2371
|
+
}
|
|
2372
|
+
const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
|
|
2373
|
+
try {
|
|
2374
|
+
await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
|
|
2375
|
+
const convertScript = path.join(dataRoot, "python", "convert_engine.py");
|
|
2376
|
+
const result = await runPythonJson(convertScript, [filePath, outputPath]);
|
|
2377
|
+
if (!result.ok) {
|
|
2378
|
+
return {
|
|
2379
|
+
content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
|
|
2380
|
+
isError: true,
|
|
2381
|
+
};
|
|
2382
|
+
}
|
|
2383
|
+
// Register converted file in the registry
|
|
2384
|
+
const datasetId = path.basename(outputPath, outputExt);
|
|
2385
|
+
try {
|
|
2386
|
+
upsertRegistry(datasetId, outputPath, "completed");
|
|
2387
|
+
}
|
|
2388
|
+
catch (e) {
|
|
2389
|
+
console.error(`[Convert] Registry write failed: ${e?.message || e}`);
|
|
2390
|
+
}
|
|
2391
|
+
let msg = `**Conversion complete**\n`;
|
|
2392
|
+
msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
|
|
2393
|
+
msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
|
|
2394
|
+
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
2395
|
+
if (result.size_mb !== undefined)
|
|
2396
|
+
msg += `- **Size**: ${result.size_mb} MB\n`;
|
|
2397
|
+
return { content: [{ type: "text", text: msg }] };
|
|
2398
|
+
}
|
|
2399
|
+
catch (error) {
|
|
2400
|
+
return {
|
|
2401
|
+
content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
|
|
2402
|
+
isError: true,
|
|
2403
|
+
};
|
|
2404
|
+
}
|
|
1909
2405
|
}
|
|
1910
|
-
|
|
1911
|
-
|
|
1912
|
-
|
|
1913
|
-
|
|
1914
|
-
|
|
1915
|
-
|
|
2406
|
+
case "fuse_datasets": {
|
|
2407
|
+
const rawSources = request.params.arguments?.sources;
|
|
2408
|
+
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
2409
|
+
throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
|
|
2410
|
+
}
|
|
2411
|
+
const strategy = request.params.arguments?.strategy || "concat";
|
|
2412
|
+
const joinOn = request.params.arguments?.join_on;
|
|
2413
|
+
const how = request.params.arguments?.how || "inner";
|
|
2414
|
+
const dedup = request.params.arguments?.dedup !== false;
|
|
2415
|
+
const runQualityAfter = request.params.arguments?.run_quality_after !== false;
|
|
2416
|
+
const leakageCheck = request.params.arguments?.leakage_check !== false;
|
|
2417
|
+
const outputFormat = request.params.arguments?.output_format || "feather";
|
|
2418
|
+
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
2419
|
+
const preview = request.params.arguments?.preview !== false;
|
|
2420
|
+
const resolvedPaths = [];
|
|
2421
|
+
const unresolved = [];
|
|
2422
|
+
for (const src of rawSources) {
|
|
2423
|
+
if (fs.existsSync(src)) {
|
|
2424
|
+
resolvedPaths.push(src);
|
|
2425
|
+
continue;
|
|
2426
|
+
}
|
|
2427
|
+
const status = metadataStore.getDownloadStatus(src);
|
|
2428
|
+
if (status?.local_path && fs.existsSync(status.local_path)) {
|
|
2429
|
+
resolvedPaths.push(status.local_path);
|
|
2430
|
+
continue;
|
|
2431
|
+
}
|
|
2432
|
+
unresolved.push(src);
|
|
2433
|
+
}
|
|
2434
|
+
if (unresolved.length > 0) {
|
|
2435
|
+
return {
|
|
2436
|
+
content: [{
|
|
2437
|
+
type: "text",
|
|
2438
|
+
text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
|
|
2439
|
+
}],
|
|
2440
|
+
isError: true
|
|
2441
|
+
};
|
|
2442
|
+
}
|
|
2443
|
+
try {
|
|
2444
|
+
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
|
|
2445
|
+
const ext = extMap[outputFormat] || ".feather";
|
|
2446
|
+
const outDir = process.cwd();
|
|
2447
|
+
if (!fs.existsSync(outDir))
|
|
2448
|
+
fs.mkdirSync(outDir, { recursive: true });
|
|
2449
|
+
const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
|
|
2450
|
+
console.error(`[Fusion] Resolved output directory: ${outDir}`);
|
|
2451
|
+
const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
|
|
2452
|
+
strategy,
|
|
2453
|
+
join_on: joinOn,
|
|
2454
|
+
how,
|
|
2455
|
+
dedup,
|
|
2456
|
+
run_quality_after: runQualityAfter,
|
|
2457
|
+
leakage_check: leakageCheck,
|
|
2458
|
+
output_format: outputFormat,
|
|
2459
|
+
compression: compression,
|
|
2460
|
+
preview,
|
|
2461
|
+
});
|
|
2462
|
+
const nullDelta = result.stats.null_delta;
|
|
2463
|
+
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
2464
|
+
// Register fused dataset under a generated id so users can export it easily
|
|
2465
|
+
const fusedId = `fused_${Date.now()}`;
|
|
2466
|
+
try {
|
|
2467
|
+
upsertRegistry(fusedId, result.output_path, "completed");
|
|
2468
|
+
}
|
|
2469
|
+
catch (e) {
|
|
2470
|
+
console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
|
|
2471
|
+
}
|
|
2472
|
+
let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
|
|
2473
|
+
msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
|
|
2474
|
+
msg += `- Null change: ${nullText}\n`;
|
|
2475
|
+
msg += `- Output: ${result.output_path}\n`;
|
|
2476
|
+
if (result.preview_path)
|
|
2477
|
+
msg += `- Preview: ${result.preview_path}\n`;
|
|
2478
|
+
if (result.leakage_report) {
|
|
2479
|
+
msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
|
|
2480
|
+
if (result.leakage_report.leakage_count) {
|
|
2481
|
+
msg += ` (${result.leakage_report.leakage_count})`;
|
|
2482
|
+
}
|
|
2483
|
+
msg += "\n";
|
|
2484
|
+
}
|
|
2485
|
+
msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
|
|
2486
|
+
return { content: [{ type: "text", text: msg }] };
|
|
2487
|
+
}
|
|
2488
|
+
catch (error) {
|
|
2489
|
+
return {
|
|
2490
|
+
content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
|
|
2491
|
+
isError: true
|
|
2492
|
+
};
|
|
2493
|
+
}
|
|
1916
2494
|
}
|
|
1917
|
-
|
|
1918
|
-
|
|
1919
|
-
|
|
1920
|
-
|
|
1921
|
-
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
1926
|
-
|
|
1927
|
-
|
|
1928
|
-
|
|
1929
|
-
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
|
|
1934
|
-
|
|
1935
|
-
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1939
|
-
|
|
1940
|
-
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
}
|
|
1946
|
-
if (report.audio_quality) {
|
|
1947
|
-
output += `## Audio Quality\n`;
|
|
1948
|
-
output += `- Total Files: ${report.audio_quality.total_files}\n`;
|
|
1949
|
-
output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
|
|
1950
|
-
output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
|
|
1951
|
-
output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
|
|
1952
|
-
}
|
|
1953
|
-
if (report.video_quality) {
|
|
1954
|
-
output += `## Video Quality\n`;
|
|
1955
|
-
output += `- Total Files: ${report.video_quality.total_files}\n`;
|
|
1956
|
-
output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
|
|
1957
|
-
output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
|
|
1958
|
-
output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
|
|
1959
|
-
}
|
|
1960
|
-
output += `## Recommendations\n`;
|
|
1961
|
-
report.recommendations.forEach(rec => {
|
|
1962
|
-
output += `- ${rec}\n`;
|
|
1963
|
-
});
|
|
1964
|
-
return {
|
|
1965
|
-
content: [{ type: "text", text: output }]
|
|
1966
|
-
};
|
|
2495
|
+
case "analyze_image_quality": {
|
|
2496
|
+
const inputPath = String(request.params.arguments?.path);
|
|
2497
|
+
if (!fs.existsSync(inputPath)) {
|
|
2498
|
+
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
2499
|
+
}
|
|
2500
|
+
try {
|
|
2501
|
+
const report = await imageAnalyzer.analyze(inputPath);
|
|
2502
|
+
let output = `## Image Quality Report\n\n`;
|
|
2503
|
+
output += `- **Total Images**: ${report.total_images}\n`;
|
|
2504
|
+
output += `- **Corrupted**: ${report.corrupted_count}\n`;
|
|
2505
|
+
output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
|
|
2506
|
+
output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
|
|
2507
|
+
if (report.individual_results.length > 0) {
|
|
2508
|
+
output += `### Sample Detail (Top 5)\n`;
|
|
2509
|
+
report.individual_results.slice(0, 5).forEach(img => {
|
|
2510
|
+
const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
|
|
2511
|
+
output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
|
|
2512
|
+
});
|
|
2513
|
+
}
|
|
2514
|
+
return {
|
|
2515
|
+
content: [{ type: "text", text: output }]
|
|
2516
|
+
};
|
|
2517
|
+
}
|
|
2518
|
+
catch (error) {
|
|
2519
|
+
return {
|
|
2520
|
+
content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
|
|
2521
|
+
isError: true
|
|
2522
|
+
};
|
|
2523
|
+
}
|
|
1967
2524
|
}
|
|
1968
|
-
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
|
|
1972
|
-
}
|
|
2525
|
+
case "analyze_media_quality": {
|
|
2526
|
+
const inputPath = String(request.params.arguments?.path);
|
|
2527
|
+
if (!fs.existsSync(inputPath)) {
|
|
2528
|
+
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
2529
|
+
}
|
|
2530
|
+
try {
|
|
2531
|
+
const report = await mediaAnalyzer.analyze(inputPath);
|
|
2532
|
+
let output = `## Media Quality Report\n\n`;
|
|
2533
|
+
output += `- **Total Files**: ${report.total_files}\n`;
|
|
2534
|
+
output += `- **OK Files**: ${report.ok_files}\n`;
|
|
2535
|
+
output += `- **Failed Files**: ${report.failed_files}\n`;
|
|
2536
|
+
if ('avg_audio_duration' in report && report.avg_audio_duration) {
|
|
2537
|
+
output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
|
|
2538
|
+
}
|
|
2539
|
+
if ('avg_video_duration' in report && report.avg_video_duration) {
|
|
2540
|
+
output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
|
|
2541
|
+
output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
|
|
2542
|
+
}
|
|
2543
|
+
output += `\n### Sample Detail (Top 5)\n`;
|
|
2544
|
+
report.details.slice(0, 5).forEach(item => {
|
|
2545
|
+
const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
|
|
2546
|
+
if (item.type === "audio" && 'sample_rate' in item) {
|
|
2547
|
+
output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
|
|
2548
|
+
}
|
|
2549
|
+
else if (item.type === "video" && 'width' in item) {
|
|
2550
|
+
output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
|
|
2551
|
+
}
|
|
2552
|
+
else {
|
|
2553
|
+
output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
|
|
2554
|
+
}
|
|
2555
|
+
});
|
|
2556
|
+
return {
|
|
2557
|
+
content: [{ type: "text", text: output }]
|
|
2558
|
+
};
|
|
2559
|
+
}
|
|
2560
|
+
catch (error) {
|
|
2561
|
+
return {
|
|
2562
|
+
content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
|
|
2563
|
+
isError: true
|
|
2564
|
+
};
|
|
2565
|
+
}
|
|
2566
|
+
}
|
|
2567
|
+
case "generate_quality_report": {
|
|
2568
|
+
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2569
|
+
const datasetPath = String(request.params.arguments?.dataset_path);
|
|
2570
|
+
if (!fs.existsSync(datasetPath)) {
|
|
2571
|
+
throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
|
|
2572
|
+
}
|
|
2573
|
+
try {
|
|
2574
|
+
// Optionally load text quality from metadata if available
|
|
2575
|
+
const metadata = await metadataStore.getDataset(datasetId);
|
|
2576
|
+
// TODO: Integrate text quality analysis when available
|
|
2577
|
+
const textQuality = null;
|
|
2578
|
+
const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
|
|
2579
|
+
// Save report to metadata
|
|
2580
|
+
if (metadata) {
|
|
2581
|
+
metadata.unified_quality_report = report;
|
|
2582
|
+
await metadataStore.saveDataset(metadata);
|
|
2583
|
+
}
|
|
2584
|
+
let output = `# Unified Quality Report\n\n`;
|
|
2585
|
+
output += `**Dataset**: ${datasetId}\n`;
|
|
2586
|
+
output += `**Modalities**: ${report.modalities.join(", ")}\n`;
|
|
2587
|
+
output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
|
|
2588
|
+
if (report.text_quality) {
|
|
2589
|
+
output += `## Text Quality\n`;
|
|
2590
|
+
output += `- Rows: ${report.text_quality.row_count}\n`;
|
|
2591
|
+
output += `- Columns: ${report.text_quality.column_count}\n`;
|
|
2592
|
+
output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
|
|
2593
|
+
output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
|
|
2594
|
+
}
|
|
2595
|
+
if (report.image_quality) {
|
|
2596
|
+
output += `## Image Quality\n`;
|
|
2597
|
+
output += `- Total Images: ${report.image_quality.total_images}\n`;
|
|
2598
|
+
output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
|
|
2599
|
+
output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
|
|
2600
|
+
output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
|
|
2601
|
+
}
|
|
2602
|
+
if (report.audio_quality) {
|
|
2603
|
+
output += `## Audio Quality\n`;
|
|
2604
|
+
output += `- Total Files: ${report.audio_quality.total_files}\n`;
|
|
2605
|
+
output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
|
|
2606
|
+
output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
|
|
2607
|
+
output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
|
|
2608
|
+
}
|
|
2609
|
+
if (report.video_quality) {
|
|
2610
|
+
output += `## Video Quality\n`;
|
|
2611
|
+
output += `- Total Files: ${report.video_quality.total_files}\n`;
|
|
2612
|
+
output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
|
|
2613
|
+
output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
|
|
2614
|
+
output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
|
|
2615
|
+
}
|
|
2616
|
+
output += `## Recommendations\n`;
|
|
2617
|
+
report.recommendations.forEach(rec => {
|
|
2618
|
+
output += `- ${rec}\n`;
|
|
2619
|
+
});
|
|
2620
|
+
return {
|
|
2621
|
+
content: [{ type: "text", text: output }]
|
|
2622
|
+
};
|
|
2623
|
+
}
|
|
2624
|
+
catch (error) {
|
|
2625
|
+
return {
|
|
2626
|
+
content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
|
|
2627
|
+
isError: true
|
|
2628
|
+
};
|
|
2629
|
+
}
|
|
1973
2630
|
}
|
|
2631
|
+
default:
|
|
2632
|
+
throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
|
|
1974
2633
|
}
|
|
1975
|
-
|
|
1976
|
-
throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
|
|
1977
|
-
}
|
|
2634
|
+
}); // end requestQueue.enqueue
|
|
1978
2635
|
});
|
|
1979
2636
|
async function main() {
|
|
1980
2637
|
const args = process.argv.slice(2);
|
|
@@ -1982,6 +2639,7 @@ async function main() {
|
|
|
1982
2639
|
const isFuse = args.includes("fuse");
|
|
1983
2640
|
const isDiscover = args.includes("discover");
|
|
1984
2641
|
const isDownload = args.includes("download");
|
|
2642
|
+
const isExport = args.includes("export");
|
|
1985
2643
|
const isConfig = args.includes("config") || args.includes("configure");
|
|
1986
2644
|
const isSetup = args.includes("--setup") || args.includes("setup");
|
|
1987
2645
|
const isSilent = args.includes("--silent");
|
|
@@ -2004,6 +2662,10 @@ async function main() {
|
|
|
2004
2662
|
await runDownloadCli(args);
|
|
2005
2663
|
return;
|
|
2006
2664
|
}
|
|
2665
|
+
if (isExport) {
|
|
2666
|
+
await runExportCli(args);
|
|
2667
|
+
return;
|
|
2668
|
+
}
|
|
2007
2669
|
// If run in explicit setup mode, show setup wizard (do not auto-run on server startup)
|
|
2008
2670
|
if (isSetup) {
|
|
2009
2671
|
await runSetupWizard(isSilent);
|
|
@@ -2276,6 +2938,99 @@ async function runDownloadCli(args) {
|
|
|
2276
2938
|
}
|
|
2277
2939
|
console.log(`Download complete: ${localPath}`);
|
|
2278
2940
|
}
|
|
2941
|
+
/**
 * CLI handler for `vespermcp export <dataset-id|local-path> [flags]`.
 *
 * Resolves an already-downloaded local dataset path, optionally runs the
 * csv/parquet normalization pipeline, then delegates to `dataExporter` to
 * produce the final output file in the requested format.
 *
 * Recognized flags:
 *   --format <fmt>       target format (default "parquet")
 *   --target-dir <dir>   output directory (default: current working dir)
 *   --compression <c>    passed through to the exporter (e.g. "snappy")
 *   --sample-rows <n>    export only the first N rows (positive integer)
 *   --columns <a,b,c>    comma-separated column subset
 *   --fast               skip the csv/parquet normalization pipeline
 *   --preview            also emit a preview file
 *
 * Exits the process with code 1 on usage errors or unresolvable input.
 */
async function runExportCli(args) {
    // Value following a `--flag value` pair, or undefined when absent/trailing.
    const getArgValue = (name) => {
        const idx = args.findIndex(a => a === name);
        if (idx >= 0 && idx + 1 < args.length)
            return args[idx + 1];
        return undefined;
    };
    // Positional args: drop `--flags` and the values consumed by value-taking
    // flags, so that nonFlags[1] is the dataset id (nonFlags[0] is "export").
    const nonFlags = args.filter((arg, index) => {
        if (arg.startsWith("--"))
            return false;
        const previous = index > 0 ? args[index - 1] : "";
        if (["--target-dir", "--format", "--compression", "--sample-rows", "--columns"].includes(previous))
            return false;
        return true;
    });
    const datasetId = nonFlags[1] || "";
    if (!datasetId) {
        console.error("Usage: vespermcp export <dataset-id|local-path> [--format parquet|feather|csv|jsonl|arrow] [--target-dir C:/path] [--compression snappy] [--fast] [--preview] [--sample-rows N] [--columns col1,col2]");
        process.exit(1);
    }
    const requestedFormat = getArgValue("--format") || "parquet";
    const targetDir = getArgValue("--target-dir");
    const compression = getArgValue("--compression");
    const sampleRows = getArgValue("--sample-rows");
    const columns = getArgValue("--columns");
    const fastMode = args.includes("--fast");
    const preview = args.includes("--preview");
    // FIX: previously a non-numeric or non-positive --sample-rows value was
    // silently forwarded as NaN/garbage to the export engine; fail fast here.
    if (sampleRows !== undefined) {
        const parsedSampleRows = Number(sampleRows);
        if (!Number.isInteger(parsedSampleRows) || parsedSampleRows <= 0) {
            console.error(`Export failed: --sample-rows must be a positive integer, got "${sampleRows}"`);
            process.exit(1);
        }
    }
    const isDirectLocalInput = isDirectLocalDatasetReference(datasetId);
    const resolvedTargetDir = path.resolve(targetDir || process.cwd());
    // Locate already-downloaded data near the target dir or the CWD.
    let sourcePath = resolveDatasetLocalPath(datasetId, [resolvedTargetDir, process.cwd()]);
    if (!sourcePath) {
        console.error(`Export failed: no local data found for ${datasetId}. Run download or prepare first, or pass a direct local path.`);
        process.exit(1);
    }
    sourcePath = ensureExportableLocalPath(sourcePath);
    // Registry bookkeeping is best-effort and must never abort the export.
    try {
        if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
            upsertRegistry(datasetId, sourcePath, "completed");
        }
    }
    catch (e) {
        console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
    }
    if (!fastMode) {
        // The normalize pipeline only understands csv/parquet; run it when the
        // input is one of those two but differs from the (pipeline) format.
        const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
        const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
        const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
        if (pipelineCompatibleInput && currentExt !== pipelineFmt) {
            try {
                sourcePath = ensureLocalPipelineSource(sourcePath, datasetId, resolvedTargetDir);
                const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
                if (pipelineResult.final_output_path) {
                    sourcePath = pipelineResult.final_output_path;
                    if (!isDirectLocalInput && shouldTrackExportPath(sourcePath)) {
                        upsertRegistry(datasetId, sourcePath, "completed");
                    }
                }
            }
            catch (err) {
                // Pipeline failure is non-fatal: fall back to exporting raw file.
                console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
            }
        }
    }
    // Assemble exporter options from validated CLI flags.
    const exportOpts = {};
    if (compression)
        exportOpts.compression = compression;
    if (preview)
        exportOpts.preview = true;
    if (sampleRows)
        exportOpts.sample_rows = Number(sampleRows);
    if (columns)
        exportOpts.columns = columns.split(",").map(col => col.trim()).filter(Boolean);
    // Map the requested format to an output extension; unknown formats fall
    // back to ".parquet" for the filename (format is still passed through).
    const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
    const ext = extMap[requestedFormat] || ".parquet";
    const safeName = getExportFileStem(datasetId);
    const outDir = resolvedTargetDir;
    if (!fs.existsSync(outDir))
        fs.mkdirSync(outDir, { recursive: true });
    const outputFile = path.join(outDir, `${safeName}${ext}`);
    console.error(`[Export] Resolved output directory: ${outDir}`);
    console.error(`[Export] Output file: ${outputFile}`);
    const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
    console.log(`Export complete: ${result.output_path}`);
    console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
    if (result.rows !== undefined)
        console.log(`Rows: ${result.rows.toLocaleString()}`);
    if (result.columns !== undefined)
        console.log(`Columns: ${result.columns}`);
    if (result.file_size_mb !== undefined)
        console.log(`Size: ${result.file_size_mb} MB`);
    if (result.preview_path)
        console.log(`Preview: ${result.preview_path}`);
}
|
|
2279
3034
|
async function runFuseCli(args) {
|
|
2280
3035
|
const getArgValue = (name) => {
|
|
2281
3036
|
const idx = args.findIndex(a => a === name);
|