vesper-wizard 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +1 -1
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
package/build/index.js
DELETED
|
@@ -1,3068 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
// --- Dataset ID Normalization ---
|
|
3
|
-
function normalize_dataset_id(dataset_id) {
|
|
4
|
-
const trimmed = dataset_id.trim();
|
|
5
|
-
const sourceMatch = trimmed.match(/^(kaggle|huggingface|hf|openml|dataworld):/i);
|
|
6
|
-
let id = trimmed.replace(/^(kaggle|huggingface|hf|openml|dataworld):/i, "");
|
|
7
|
-
// Replace / and : with _ for filesystem safety
|
|
8
|
-
id = id.replace(/[\\/:]/g, "_");
|
|
9
|
-
if (!sourceMatch) {
|
|
10
|
-
return id;
|
|
11
|
-
}
|
|
12
|
-
const source = sourceMatch[1].toLowerCase() === "hf" ? "huggingface" : sourceMatch[1].toLowerCase();
|
|
13
|
-
return `${source}_${id}`;
|
|
14
|
-
}
|
|
15
|
-
function getDatasetIdAliases(dataset_id) {
|
|
16
|
-
const trimmed = dataset_id.trim();
|
|
17
|
-
const aliases = new Set([trimmed]);
|
|
18
|
-
const sourceMatch = trimmed.match(/^(kaggle|huggingface|hf|openml|dataworld):/i);
|
|
19
|
-
if (sourceMatch) {
|
|
20
|
-
const stripped = trimmed.replace(/^(kaggle|huggingface|hf|openml|dataworld):/i, "");
|
|
21
|
-
aliases.add(stripped);
|
|
22
|
-
if (sourceMatch[1].toLowerCase() === "hf") {
|
|
23
|
-
aliases.add(`huggingface:${stripped}`);
|
|
24
|
-
}
|
|
25
|
-
}
|
|
26
|
-
else {
|
|
27
|
-
aliases.add(`kaggle:${trimmed}`);
|
|
28
|
-
aliases.add(`huggingface:${trimmed}`);
|
|
29
|
-
aliases.add(`hf:${trimmed}`);
|
|
30
|
-
aliases.add(`openml:${trimmed}`);
|
|
31
|
-
aliases.add(`dataworld:${trimmed}`);
|
|
32
|
-
}
|
|
33
|
-
return Array.from(aliases);
|
|
34
|
-
}
|
|
35
|
-
function toSafeDatasetPathFragment(dataset_id) {
|
|
36
|
-
return normalize_dataset_id(dataset_id);
|
|
37
|
-
}
|
|
38
|
-
// --- Dataset Registry Helpers ---
|
|
39
|
-
function getRegistryPath() {
|
|
40
|
-
return path.join(dataRoot, "registry.json");
|
|
41
|
-
}
|
|
42
|
-
function readRegistry() {
|
|
43
|
-
const registryPath = getRegistryPath();
|
|
44
|
-
if (!fs.existsSync(registryPath))
|
|
45
|
-
return [];
|
|
46
|
-
try {
|
|
47
|
-
const raw = fs.readFileSync(registryPath, "utf-8");
|
|
48
|
-
return JSON.parse(raw);
|
|
49
|
-
}
|
|
50
|
-
catch {
|
|
51
|
-
return [];
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
|
-
function writeRegistry(entries) {
|
|
55
|
-
const registryPath = getRegistryPath();
|
|
56
|
-
fs.writeFileSync(registryPath, JSON.stringify(entries, null, 2));
|
|
57
|
-
}
|
|
58
|
-
function upsertRegistry(dataset_id, local_path, status) {
|
|
59
|
-
const aliases = getDatasetIdAliases(dataset_id).map(normalize_dataset_id);
|
|
60
|
-
const norm_id = aliases[0];
|
|
61
|
-
console.error(`[Registry] Writing key: ${norm_id}`);
|
|
62
|
-
const entries = readRegistry();
|
|
63
|
-
const idx = entries.findIndex(e => aliases.includes(e.dataset_id || e.id));
|
|
64
|
-
if (idx >= 0) {
|
|
65
|
-
entries[idx] = { dataset_id: norm_id, local_path, status };
|
|
66
|
-
}
|
|
67
|
-
else {
|
|
68
|
-
entries.push({ dataset_id: norm_id, local_path, status });
|
|
69
|
-
}
|
|
70
|
-
writeRegistry(entries);
|
|
71
|
-
}
|
|
72
|
-
function getRegistryEntry(dataset_id) {
|
|
73
|
-
const aliases = getDatasetIdAliases(dataset_id).map(normalize_dataset_id);
|
|
74
|
-
console.error(`[Registry] Lookup keys: ${aliases.join(", ")}`);
|
|
75
|
-
return readRegistry().find(e => aliases.includes((e.dataset_id || e.id)));
|
|
76
|
-
}
|
|
77
|
-
const STRUCTURED_FILE_EXTENSIONS = [".parquet", ".csv", ".jsonl", ".json", ".feather", ".arrow", ".tsv", ".txt"];
|
|
78
|
-
const IMAGE_FILE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".webp", ".bmp", ".gif", ".tiff", ".tif", ".svg"]);
|
|
79
|
-
function walkFilesRecursive(rootDir) {
|
|
80
|
-
const out = [];
|
|
81
|
-
const stack = [rootDir];
|
|
82
|
-
while (stack.length > 0) {
|
|
83
|
-
const currentDir = stack.pop();
|
|
84
|
-
const entries = fs.readdirSync(currentDir, { withFileTypes: true });
|
|
85
|
-
for (const entry of entries) {
|
|
86
|
-
const fullPath = path.join(currentDir, entry.name);
|
|
87
|
-
if (entry.isDirectory()) {
|
|
88
|
-
stack.push(fullPath);
|
|
89
|
-
}
|
|
90
|
-
else if (entry.isFile()) {
|
|
91
|
-
out.push(fullPath);
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
}
|
|
95
|
-
out.sort();
|
|
96
|
-
return out;
|
|
97
|
-
}
|
|
98
|
-
function inferImageManifestRecord(rootDir, fullPath, index) {
|
|
99
|
-
const relativePath = path.relative(rootDir, fullPath).replace(/\\/g, "/");
|
|
100
|
-
const parentDir = path.posix.dirname(relativePath);
|
|
101
|
-
const parts = parentDir.split("/").filter(part => part && part !== ".");
|
|
102
|
-
let split;
|
|
103
|
-
let label;
|
|
104
|
-
if (parts.length > 0) {
|
|
105
|
-
const first = parts[0].toLowerCase();
|
|
106
|
-
if (["train", "test", "val", "valid", "validation"].includes(first)) {
|
|
107
|
-
split = parts[0];
|
|
108
|
-
if (parts.length > 1) {
|
|
109
|
-
label = parts[parts.length - 1];
|
|
110
|
-
}
|
|
111
|
-
}
|
|
112
|
-
else {
|
|
113
|
-
label = parts[parts.length - 1];
|
|
114
|
-
}
|
|
115
|
-
}
|
|
116
|
-
return {
|
|
117
|
-
id: index,
|
|
118
|
-
image_path: path.resolve(fullPath),
|
|
119
|
-
relative_path: relativePath,
|
|
120
|
-
file_name: path.basename(fullPath),
|
|
121
|
-
extension: path.extname(fullPath).toLowerCase().replace(/^\./, ""),
|
|
122
|
-
...(split ? { split } : {}),
|
|
123
|
-
...(label ? { label } : {}),
|
|
124
|
-
};
|
|
125
|
-
}
|
|
126
|
-
function createImageManifestFromDirectory(rootDir) {
|
|
127
|
-
const imageFiles = walkFilesRecursive(rootDir).filter(filePath => IMAGE_FILE_EXTENSIONS.has(path.extname(filePath).toLowerCase()));
|
|
128
|
-
if (imageFiles.length === 0) {
|
|
129
|
-
throw new Error(`No image files found under ${rootDir}`);
|
|
130
|
-
}
|
|
131
|
-
const manifestPath = path.join(rootDir, "_vesper_image_manifest.jsonl");
|
|
132
|
-
const lines = imageFiles.map((filePath, index) => JSON.stringify(inferImageManifestRecord(rootDir, filePath, index)));
|
|
133
|
-
fs.writeFileSync(manifestPath, `${lines.join("\n")}\n`, "utf-8");
|
|
134
|
-
return manifestPath;
|
|
135
|
-
}
|
|
136
|
-
function ensureExportableLocalPath(localPath) {
|
|
137
|
-
if (!fs.existsSync(localPath)) {
|
|
138
|
-
throw new Error(`Local path not found: ${localPath}`);
|
|
139
|
-
}
|
|
140
|
-
const stats = fs.statSync(localPath);
|
|
141
|
-
if (stats.isFile()) {
|
|
142
|
-
return localPath;
|
|
143
|
-
}
|
|
144
|
-
const manifestPath = path.join(localPath, "_vesper_image_manifest.jsonl");
|
|
145
|
-
if (fs.existsSync(manifestPath)) {
|
|
146
|
-
return manifestPath;
|
|
147
|
-
}
|
|
148
|
-
const candidates = walkFilesRecursive(localPath);
|
|
149
|
-
for (const ext of STRUCTURED_FILE_EXTENSIONS) {
|
|
150
|
-
const match = candidates.find(candidate => path.extname(candidate).toLowerCase() === ext);
|
|
151
|
-
if (match) {
|
|
152
|
-
return match;
|
|
153
|
-
}
|
|
154
|
-
}
|
|
155
|
-
return createImageManifestFromDirectory(localPath);
|
|
156
|
-
}
|
|
157
|
-
function resolveDatasetLocalPath(datasetIdOrPath) {
|
|
158
|
-
if (fs.existsSync(datasetIdOrPath)) {
|
|
159
|
-
return ensureExportableLocalPath(datasetIdOrPath);
|
|
160
|
-
}
|
|
161
|
-
const downloadStatus = metadataStore.getDownloadStatus(datasetIdOrPath);
|
|
162
|
-
if (downloadStatus?.local_path && fs.existsSync(downloadStatus.local_path)) {
|
|
163
|
-
return ensureExportableLocalPath(downloadStatus.local_path);
|
|
164
|
-
}
|
|
165
|
-
const reg = getRegistryEntry(datasetIdOrPath);
|
|
166
|
-
const regPath = reg?.local_path || reg?.path;
|
|
167
|
-
if (regPath && fs.existsSync(regPath)) {
|
|
168
|
-
return ensureExportableLocalPath(regPath);
|
|
169
|
-
}
|
|
170
|
-
const safeId = toSafeDatasetPathFragment(datasetIdOrPath);
|
|
171
|
-
const rawCandidates = [
|
|
172
|
-
path.join(dataRoot, "data", "raw", `${safeId}.parquet`),
|
|
173
|
-
path.join(dataRoot, "data", "raw", `${safeId}.csv`),
|
|
174
|
-
path.join(dataRoot, "data", "raw", `${safeId}.jsonl`),
|
|
175
|
-
path.join(dataRoot, "data", "raw", `${safeId}.json`),
|
|
176
|
-
path.join(dataRoot, "data", "raw", `${safeId}.feather`),
|
|
177
|
-
path.join(dataRoot, "data", "raw", `${safeId}.arrow`),
|
|
178
|
-
path.join(dataRoot, "data", "raw", safeId),
|
|
179
|
-
];
|
|
180
|
-
const match = rawCandidates.find(candidate => fs.existsSync(candidate));
|
|
181
|
-
return match ? ensureExportableLocalPath(match) : undefined;
|
|
182
|
-
}
|
|
183
|
-
// --- Pipeline State Tracker ---
|
|
184
|
-
// Tracks completed steps per session/job/dataset
|
|
185
|
-
const pipelineState = {};
|
|
186
|
-
const jobStatusLastPoll = {};
|
|
187
|
-
function getPipelineKey(datasetId) {
|
|
188
|
-
return datasetId;
|
|
189
|
-
}
|
|
190
|
-
export function markStepComplete(datasetId, step) {
|
|
191
|
-
const key = getPipelineKey(datasetId);
|
|
192
|
-
if (!pipelineState[key])
|
|
193
|
-
pipelineState[key] = new Set();
|
|
194
|
-
pipelineState[key].add(step);
|
|
195
|
-
}
|
|
196
|
-
export function hasStep(datasetId, step) {
|
|
197
|
-
const key = getPipelineKey(datasetId);
|
|
198
|
-
return pipelineState[key]?.has(step);
|
|
199
|
-
}
|
|
200
|
-
// --- Dataset ID Auto-Detection ---
|
|
201
|
-
export function parseDatasetId(id) {
|
|
202
|
-
const trimmed = id.trim();
|
|
203
|
-
if (/^(kaggle:|hf:|huggingface:|openml:|dataworld:|http|https):/i.test(trimmed))
|
|
204
|
-
return trimmed;
|
|
205
|
-
if (trimmed.includes("/") && !trimmed.includes(":"))
|
|
206
|
-
return `kaggle:${trimmed}`;
|
|
207
|
-
return trimmed;
|
|
208
|
-
}
|
|
209
|
-
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
|
210
|
-
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
211
|
-
import { CallToolRequestSchema, ListToolsRequestSchema, ErrorCode, McpError, } from "@modelcontextprotocol/sdk/types.js";
|
|
212
|
-
import { fileURLToPath } from "url";
|
|
213
|
-
import path from "path";
|
|
214
|
-
import fs from "fs";
|
|
215
|
-
import { spawn } from "child_process";
|
|
216
|
-
import { spawnSync } from "child_process";
|
|
217
|
-
import { MetadataStore } from "./metadata/store.js";
|
|
218
|
-
import { VectorStore } from "./search/vector-store.js";
|
|
219
|
-
import { Embedder } from "./search/embedder.js";
|
|
220
|
-
import { SearchEngine } from "./search/engine.js";
|
|
221
|
-
import { HuggingFaceScraper } from "./metadata/scraper.js";
|
|
222
|
-
import { KaggleSource } from "./metadata/kaggle-source.js";
|
|
223
|
-
import { OpenMLSource } from "./metadata/openml-source.js";
|
|
224
|
-
import { DataWorldSource } from "./metadata/dataworld-source.js";
|
|
225
|
-
import { UnifiedDatasetGateway } from "./gateway/unified-dataset-gateway.js";
|
|
226
|
-
import { formatSearchResults, formatDatasetInfo, formatJobStatus } from "./tools/formatter.js";
|
|
227
|
-
import { JobManager } from "./jobs/manager.js";
|
|
228
|
-
import { QualityAnalyzer } from "./quality/analyzer.js";
|
|
229
|
-
import { CleaningPlanner } from "./cleaning/planner.js";
|
|
230
|
-
import { DataCleaner } from "./cleaning/cleaner.js";
|
|
231
|
-
import { PipelineExecutor } from "./cleaning/executor.js";
|
|
232
|
-
import { DataSplitter } from "./splitting/splitter.js";
|
|
233
|
-
import { DataExporter } from "./export/exporter.js";
|
|
234
|
-
import { DataFusionEngine } from "./fusion/engine.js";
|
|
235
|
-
import { DataIngestor } from "./ingestion/ingestor.js";
|
|
236
|
-
import { InstallService } from "./install/install-service.js";
|
|
237
|
-
import { CacheService, MockRedisProvider } from "./cache/service.js";
|
|
238
|
-
import { ImageAnalyzer } from "./quality/image-analyzer.js";
|
|
239
|
-
import { MediaAnalyzer } from "./quality/media-analyzer.js";
|
|
240
|
-
import { QualityOrchestrator } from "./quality/quality-orchestrator.js";
|
|
241
|
-
import { ConfigManager } from "./config/config-manager.js";
|
|
242
|
-
import { SecureKeysManager } from "./config/secure-keys.js";
|
|
243
|
-
import readline from "readline";
|
|
244
|
-
import os from "os";
|
|
245
|
-
// Determine absolute paths relative to the compiled script
|
|
246
|
-
const __filename = fileURLToPath(import.meta.url);
|
|
247
|
-
const __dirname = path.dirname(__filename);
|
|
248
|
-
// appRoot: Where the source code/scripts are (inside node_modules or source)
|
|
249
|
-
const appRoot = path.join(__dirname, "..");
|
|
250
|
-
// dataRoot: Where database and user data live (in user home)
|
|
251
|
-
// Use os.homedir() as it's more reliable than env vars
|
|
252
|
-
const homeDir = os.homedir() || process.env.HOME || process.env.USERPROFILE || appRoot;
|
|
253
|
-
const dataRoot = path.join(homeDir, ".vesper");
|
|
254
|
-
// Ensure data directory exists
|
|
255
|
-
if (!fs.existsSync(dataRoot))
|
|
256
|
-
fs.mkdirSync(dataRoot, { recursive: true });
|
|
257
|
-
const dbPath = path.join(dataRoot, "data", "metadata.db");
|
|
258
|
-
const vectorPath = path.join(dataRoot, "data", "vectors.json");
|
|
259
|
-
const errorLogPath = path.join(dataRoot, "vesper_errors.log");
|
|
260
|
-
console.error(`[Vesper] Data directory: ${dataRoot}`);
|
|
261
|
-
console.error(`[Vesper] Database path: ${dbPath}`);
|
|
262
|
-
function logError(err, context) {
|
|
263
|
-
const timestamp = new Date().toISOString();
|
|
264
|
-
const stack = err.stack || String(err);
|
|
265
|
-
const msg = `[${timestamp}] ERROR in ${context}:\n${stack}\n${"-".repeat(50)}\n`;
|
|
266
|
-
fs.appendFileSync(errorLogPath, msg);
|
|
267
|
-
console.error(`[Vesper] Critical error logged to ${errorLogPath}`);
|
|
268
|
-
}
|
|
269
|
-
// --- Request Queue: serialize all MCP tool calls to prevent crashes ---
|
|
270
|
-
class RequestQueue {
|
|
271
|
-
queue = [];
|
|
272
|
-
running = false;
|
|
273
|
-
enqueue(task) {
|
|
274
|
-
return new Promise((resolve, reject) => {
|
|
275
|
-
this.queue.push({ resolve, reject, task });
|
|
276
|
-
this.drain();
|
|
277
|
-
});
|
|
278
|
-
}
|
|
279
|
-
async drain() {
|
|
280
|
-
if (this.running)
|
|
281
|
-
return;
|
|
282
|
-
this.running = true;
|
|
283
|
-
while (this.queue.length > 0) {
|
|
284
|
-
const item = this.queue.shift();
|
|
285
|
-
try {
|
|
286
|
-
const result = await item.task();
|
|
287
|
-
item.resolve(result);
|
|
288
|
-
}
|
|
289
|
-
catch (err) {
|
|
290
|
-
item.reject(err);
|
|
291
|
-
}
|
|
292
|
-
}
|
|
293
|
-
this.running = false;
|
|
294
|
-
}
|
|
295
|
-
}
|
|
296
|
-
const requestQueue = new RequestQueue();
|
|
297
|
-
const SPINNER_FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
|
|
298
|
-
function printLaunchScreen() {
|
|
299
|
-
const screen = `
|
|
300
|
-
══════════════════════════════════════════════
|
|
301
|
-
|
|
302
|
-
██ ██ ███████ ███████ ██████ ███████ ██████
|
|
303
|
-
██ ██ ██ ██ ██ ██ ██ ██ ██
|
|
304
|
-
██ ██ █████ ███████ ██████ █████ ██████
|
|
305
|
-
██ ██ ██ ██ ██ ██ ██ ██
|
|
306
|
-
████ ███████ ███████ ██ ███████ ██ ██
|
|
307
|
-
|
|
308
|
-
dataset intelligence layer
|
|
309
|
-
mcp-native • agent-first
|
|
310
|
-
|
|
311
|
-
══════════════════════════════════════════════
|
|
312
|
-
|
|
313
|
-
[ core ] initializing
|
|
314
|
-
[ splitting ] leakage-safe
|
|
315
|
-
[ quality ] multimodal scan
|
|
316
|
-
[ fusion ] guarded
|
|
317
|
-
[ synth ] generation ready
|
|
318
|
-
|
|
319
|
-
status: operational
|
|
320
|
-
`;
|
|
321
|
-
console.error(screen);
|
|
322
|
-
}
|
|
323
|
-
async function runWithSpinner(label, task) {
|
|
324
|
-
if (!process.stderr.isTTY) {
|
|
325
|
-
return task();
|
|
326
|
-
}
|
|
327
|
-
let frameIndex = 0;
|
|
328
|
-
let timer;
|
|
329
|
-
let spinnerShown = false;
|
|
330
|
-
const delayedStart = setTimeout(() => {
|
|
331
|
-
spinnerShown = true;
|
|
332
|
-
timer = setInterval(() => {
|
|
333
|
-
const frame = SPINNER_FRAMES[frameIndex % SPINNER_FRAMES.length];
|
|
334
|
-
frameIndex += 1;
|
|
335
|
-
process.stderr.write(`\r${frame} ${label}`);
|
|
336
|
-
}, 90);
|
|
337
|
-
}, 1000);
|
|
338
|
-
try {
|
|
339
|
-
const result = await task();
|
|
340
|
-
clearTimeout(delayedStart);
|
|
341
|
-
if (timer)
|
|
342
|
-
clearInterval(timer);
|
|
343
|
-
if (spinnerShown)
|
|
344
|
-
process.stderr.write(`\r[ok] ${label} \n`);
|
|
345
|
-
return result;
|
|
346
|
-
}
|
|
347
|
-
catch (error) {
|
|
348
|
-
clearTimeout(delayedStart);
|
|
349
|
-
if (timer)
|
|
350
|
-
clearInterval(timer);
|
|
351
|
-
if (spinnerShown)
|
|
352
|
-
process.stderr.write(`\r[error] ${label} \n`);
|
|
353
|
-
throw error;
|
|
354
|
-
}
|
|
355
|
-
}
|
|
356
|
-
function extractRequestedRows(query, requirements) {
|
|
357
|
-
const text = `${query || ""} ${requirements || ""}`.toLowerCase();
|
|
358
|
-
const explicit = text.match(/(\d[\d,\s]{1,12})\s*(samples?|rows?|records?)/i);
|
|
359
|
-
if (explicit) {
|
|
360
|
-
const n = Number(explicit[1].replace(/[\s,]/g, ""));
|
|
361
|
-
if (Number.isFinite(n) && n > 0)
|
|
362
|
-
return n;
|
|
363
|
-
}
|
|
364
|
-
const commaNumbers = [...text.matchAll(/\b\d{1,3}(?:,\d{3})+\b/g)]
|
|
365
|
-
.map(m => Number(m[0].replace(/,/g, "")))
|
|
366
|
-
.filter(n => Number.isFinite(n) && n > 0);
|
|
367
|
-
if (commaNumbers.length > 0)
|
|
368
|
-
return Math.max(...commaNumbers);
|
|
369
|
-
const humanSized = [...text.matchAll(/\b(\d+(?:\.\d+)?)\s*([kmb])\b/gi)]
|
|
370
|
-
.map(m => {
|
|
371
|
-
const base = Number(m[1]);
|
|
372
|
-
const suffix = m[2].toLowerCase();
|
|
373
|
-
const multiplier = suffix === "k" ? 1_000 : suffix === "m" ? 1_000_000 : 1_000_000_000;
|
|
374
|
-
return Math.round(base * multiplier);
|
|
375
|
-
})
|
|
376
|
-
.filter(n => Number.isFinite(n) && n > 0);
|
|
377
|
-
if (humanSized.length > 0)
|
|
378
|
-
return Math.max(...humanSized);
|
|
379
|
-
const allNums = [...text.matchAll(/\b\d{4,9}\b/g)]
|
|
380
|
-
.map(m => Number(m[0]))
|
|
381
|
-
.filter(n => Number.isFinite(n) && n > 0);
|
|
382
|
-
if (allNums.length > 0)
|
|
383
|
-
return Math.max(...allNums);
|
|
384
|
-
return undefined;
|
|
385
|
-
}
|
|
386
|
-
const verifiedPythonModules = new Set();
|
|
387
|
-
function getPythonCommand() {
|
|
388
|
-
return process.platform === "win32" ? "py" : "python";
|
|
389
|
-
}
|
|
390
|
-
function runPythonProcess(args, timeoutMs = 300000) {
|
|
391
|
-
const pyCmd = getPythonCommand();
|
|
392
|
-
return new Promise((resolve, reject) => {
|
|
393
|
-
const proc = spawn(pyCmd, args, {
|
|
394
|
-
env: {
|
|
395
|
-
...process.env,
|
|
396
|
-
PIP_DISABLE_PIP_VERSION_CHECK: "1",
|
|
397
|
-
PYTHONUTF8: "1",
|
|
398
|
-
},
|
|
399
|
-
});
|
|
400
|
-
let stdout = "";
|
|
401
|
-
let stderr = "";
|
|
402
|
-
proc.stdout.on("data", (d) => (stdout += d.toString()));
|
|
403
|
-
proc.stderr.on("data", (d) => (stderr += d.toString()));
|
|
404
|
-
const timer = setTimeout(() => {
|
|
405
|
-
try {
|
|
406
|
-
proc.kill();
|
|
407
|
-
}
|
|
408
|
-
catch {
|
|
409
|
-
// no-op
|
|
410
|
-
}
|
|
411
|
-
reject(new Error(`Python command timed out after ${timeoutMs}ms: ${args.join(" ")}`));
|
|
412
|
-
}, timeoutMs);
|
|
413
|
-
proc.on("close", (code) => {
|
|
414
|
-
clearTimeout(timer);
|
|
415
|
-
resolve({ code: code ?? 1, stdout, stderr });
|
|
416
|
-
});
|
|
417
|
-
proc.on("error", (error) => {
|
|
418
|
-
clearTimeout(timer);
|
|
419
|
-
reject(error);
|
|
420
|
-
});
|
|
421
|
-
});
|
|
422
|
-
}
|
|
423
|
-
async function ensurePythonModules(modulePackagePairs) {
|
|
424
|
-
const missing = [];
|
|
425
|
-
for (const pair of modulePackagePairs) {
|
|
426
|
-
if (verifiedPythonModules.has(pair.module)) {
|
|
427
|
-
continue;
|
|
428
|
-
}
|
|
429
|
-
const check = await runPythonProcess([
|
|
430
|
-
"-c",
|
|
431
|
-
`import importlib.util,sys; sys.exit(0 if importlib.util.find_spec(${JSON.stringify(pair.module)}) else 1)`
|
|
432
|
-
], 20000);
|
|
433
|
-
if (check.code === 0) {
|
|
434
|
-
verifiedPythonModules.add(pair.module);
|
|
435
|
-
}
|
|
436
|
-
else {
|
|
437
|
-
missing.push(pair);
|
|
438
|
-
}
|
|
439
|
-
}
|
|
440
|
-
if (missing.length === 0) {
|
|
441
|
-
return;
|
|
442
|
-
}
|
|
443
|
-
const packages = [...new Set(missing.map(m => m.packageName))];
|
|
444
|
-
console.error(`[Vesper] Installing missing Python packages: ${packages.join(", ")}`);
|
|
445
|
-
const installArgs = ["-m", "pip", "install", "--disable-pip-version-check", ...packages];
|
|
446
|
-
let install = await runPythonProcess(installArgs, 600000);
|
|
447
|
-
if (install.code !== 0) {
|
|
448
|
-
console.error(`[Vesper] pip install failed (trying --user fallback): ${(install.stderr || "").slice(0, 300)}`);
|
|
449
|
-
const userInstallArgs = ["-m", "pip", "install", "--disable-pip-version-check", "--user", ...packages];
|
|
450
|
-
install = await runPythonProcess(userInstallArgs, 600000);
|
|
451
|
-
}
|
|
452
|
-
if (install.code !== 0) {
|
|
453
|
-
const details = (install.stderr || install.stdout || "Unknown pip install error").trim();
|
|
454
|
-
throw new Error(`Failed to install required Python packages (${packages.join(", ")}). ${details}`);
|
|
455
|
-
}
|
|
456
|
-
console.error(`[Vesper] Successfully installed: ${packages.join(", ")}`);
|
|
457
|
-
for (const pair of missing) {
|
|
458
|
-
verifiedPythonModules.add(pair.module);
|
|
459
|
-
}
|
|
460
|
-
}
|
|
461
|
-
/**
 * Run a Python helper script and parse its stdout as JSON.
 *
 * @param {string} scriptPath - Absolute path to the helper script.
 * @param {string[]} args - Arguments appended after the script path.
 * @returns {Promise<any>} Parsed JSON object printed by the helper.
 * @throws Rejects when the interpreter cannot be spawned, exits non-zero,
 *   or prints output that is not valid JSON.
 */
function runPythonJson(scriptPath, args) {
    const pyCmd = getPythonCommand();
    return new Promise((resolve, reject) => {
        const proc = spawn(pyCmd, [scriptPath, ...args]);
        let stdout = "";
        let stderr = "";
        proc.stdout.on("data", (d) => (stdout += d.toString()));
        proc.stderr.on("data", (d) => (stderr += d.toString()));
        // Fix: spawn failures (e.g. python missing from PATH) are delivered via
        // the "error" event, not "close" — without this handler the Promise
        // would never settle and the caller would hang forever.
        proc.on("error", (err) => {
            reject(new Error(`Failed to spawn ${pyCmd}: ${err.message}`));
        });
        proc.on("close", (code) => {
            if (code !== 0) {
                reject(new Error(stderr || stdout || `Python exited with ${code}`));
                return;
            }
            try {
                resolve(JSON.parse(stdout));
            }
            catch {
                reject(new Error(`Invalid JSON from python helper: ${stdout}`));
            }
        });
    });
}
|
|
483
|
-
/**
 * Count data rows in a tabular file via the bundled row_count.py helper.
 *
 * @param {string} filePath - File whose rows should be counted.
 * @returns {Promise<number>} Row count (0 when the helper reports none).
 * @throws When the helper reports failure.
 */
async function countRows(filePath) {
    const helperScript = path.join(dataRoot, "python", "row_count.py");
    const response = await runPythonJson(helperScript, [filePath]);
    if (!response.ok) {
        throw new Error(response.error || "Failed to count rows");
    }
    return Number(response.rows || 0);
}
|
|
490
|
-
/**
 * Sync Python scripts from the application package to the stable data
 * directory (~/.vesper/python) so they survive package relocation/upgrades.
 *
 * @param {string} appRoot - Root of the installed application package.
 * @param {string} dataRoot - Stable per-user data directory.
 */
function syncPythonScripts(appRoot, dataRoot) {
    const pythonDest = path.join(dataRoot, "python");
    if (!fs.existsSync(pythonDest))
        fs.mkdirSync(pythonDest, { recursive: true });
    // Recursively gather every *.py file under a directory (empty if absent).
    const collectPyFiles = (dir) => {
        if (!fs.existsSync(dir))
            return [];
        const found = [];
        for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
            const fullPath = path.join(dir, entry.name);
            if (entry.isDirectory()) {
                found.push(...collectPyFiles(fullPath));
            }
            else if (entry.isFile() && fullPath.endsWith(".py")) {
                found.push(fullPath);
            }
        }
        return found;
    };
    // Candidate locations for Python scripts inside the package.
    const sources = [
        path.join(appRoot, "src", "python"),
        path.join(appRoot, "build", "python"),
        path.join(appRoot, "python")
    ];
    let copied = 0;
    for (const srcDir of sources) {
        for (const srcPath of collectPyFiles(srcDir)) {
            const destPath = path.join(pythonDest, path.relative(srcDir, srcPath));
            const srcStat = fs.statSync(srcPath);
            // A destination is considered current when it matches the source
            // size and the source has not been modified more recently.
            const upToDate = fs.existsSync(destPath) && (() => {
                const destStat = fs.statSync(destPath);
                return srcStat.size === destStat.size && srcStat.mtimeMs <= destStat.mtimeMs;
            })();
            if (!upToDate) {
                fs.mkdirSync(path.dirname(destPath), { recursive: true });
                fs.copyFileSync(srcPath, destPath);
                copied++;
            }
        }
    }
    if (copied > 0) {
        console.error(`[Vesper] Synced ${copied} Python scripts to ${pythonDest}`);
    }
}
|
|
546
|
-
// Sync bundled Python helper scripts into the stable data directory now, at
// module load, so later pipeline steps can rely on ~/.vesper/python existing.
syncPythonScripts(appRoot, dataRoot);
|
|
548
|
-
// Auto-rebuild better-sqlite3 if the native binary doesn't match the current
// Node ABI. Returns true when the rebuild succeeded and the require cache has
// been purged so the next load picks up the fresh binary.
function tryRebuildSqlite() {
    try {
        const { execSync } = require("child_process");
        const pkgRoot = path.resolve(__dirname, "..");
        console.error("[Vesper] Rebuilding better-sqlite3 for Node " + process.version + "...");
        execSync("npm rebuild better-sqlite3", { stdio: "pipe", timeout: 60000, cwd: pkgRoot });
        console.error("[Vesper] Rebuild succeeded. Retrying...");
        // Drop every cached copy of the module so the rebuilt binary is loaded.
        const isSqliteEntry = (key) => key.includes("better-sqlite3") || key.includes("better_sqlite3");
        Object.keys(require.cache)
            .filter(isSqliteEntry)
            .forEach((key) => {
                delete require.cache[key];
            });
        return true;
    }
    catch (e) {
        console.error("[Vesper] Auto-rebuild failed: " + (e?.message || e));
        return false;
    }
}
|
|
573
|
-
// Open the SQLite-backed metadata store. If the native better-sqlite3 binding
// was compiled against a different Node ABI (ERR_DLOPEN_FAILED), attempt one
// automatic `npm rebuild` and retry once before giving up with guidance.
let metadataStore;
try {
    metadataStore = new MetadataStore(dbPath);
}
catch (e) {
    if (e?.code === "ERR_DLOPEN_FAILED" && tryRebuildSqlite()) {
        // Rebuild reported success and purged the require cache — retry once.
        metadataStore = new MetadataStore(dbPath);
    }
    else {
        console.error("[Vesper] FATAL: Cannot load better-sqlite3.");
        console.error("[Vesper] Run: npm rebuild better-sqlite3");
        throw e;
    }
}
|
|
587
|
-
// Core service singletons. Persistence-oriented services receive dataRoot so
// their state survives upgrades; processing services receive __dirname (the
// build directory) — presumably so they can locate bundled helper scripts
// relative to build/ (see the "CRITICAL FIX" note further down) — confirm.
const vectorStore = new VectorStore(vectorPath);
const embedder = Embedder.getInstance();
const searchEngine = new SearchEngine(metadataStore, vectorStore, embedder);
const jobManager = JobManager.getInstance(metadataStore);
// Use dataRoot for storage services (persistence)
const dataIngestor = new DataIngestor(dataRoot, metadataStore);
const installService = new InstallService(dataRoot, metadataStore);
const cacheService = new CacheService(new MockRedisProvider());
const dataCleaner = new DataCleaner(__dirname);
const pipelineExecutor = new PipelineExecutor(dataRoot, __dirname);
const dataSplitter = new DataSplitter(__dirname);
const dataExporter = new DataExporter(__dirname);
const fusionEngine = new DataFusionEngine(__dirname);
const kaggleSource = new KaggleSource(__dirname);
const openmlSource = new OpenMLSource(__dirname);
const dataworldSource = new DataWorldSource(__dirname);
const secureKeys = new SecureKeysManager(__dirname);
|
|
604
|
-
/**
 * Copy stored credentials into process.env for variables that are not
 * already set, so downstream tooling (HF, Kaggle, data.world) can see them.
 */
function hydrateExternalKeys() {
    const stored = secureKeys.getAll();
    // HuggingFace recognises two env names; hydrate only if neither is set.
    if (!process.env.HF_TOKEN && !process.env.HUGGINGFACE_TOKEN && stored.hf_token) {
        process.env.HF_TOKEN = String(stored.hf_token);
    }
    // Remaining credentials map one-to-one: env var ← stored key.
    const mappings = [
        ["KAGGLE_USERNAME", "kaggle_username"],
        ["KAGGLE_KEY", "kaggle_key"],
        ["DW_AUTH_TOKEN", "dataworld_token"],
    ];
    for (const [envName, storedName] of mappings) {
        if (!process.env[envName] && stored[storedName]) {
            process.env[envName] = String(stored[storedName]);
        }
    }
}
|
|
619
|
-
/**
 * Whether a data.world token is available, either from the environment or
 * from the secure key store. (Uses ||, not ??, so an empty-string env var
 * still falls back to the stored token — same as the original.)
 */
function hasDataWorldToken() {
    const envToken = process.env.DW_AUTH_TOKEN;
    return Boolean(envToken || secureKeys.getAll().dataworld_token);
}
|
|
622
|
-
// Facade that routes provider-agnostic dataset operations (providers /
// discover / download / info) to the matching source backend.
const unifiedDatasetGateway = new UnifiedDatasetGateway({
    metadataStore,
    dataIngestor,
    dataRoot,
    kaggleSource,
    openmlSource,
    dataworldSource,
    hasDataWorldToken,
});
// CRITICAL FIX: Pass __dirname (build directory) to analyzers
// Python scripts are in build/python/, so analyzers should look relative to build/
// NOT relative to project root (appRoot)
// Force UTF-8 on Python child-process stdio regardless of platform locale.
process.env.PYTHONIOENCODING = "utf-8";
const qualityAnalyzer = new QualityAnalyzer(cacheService, __dirname);
const cleaningPlanner = new CleaningPlanner(cacheService, __dirname); // Pass __dirname for TargetDetector
const imageAnalyzer = new ImageAnalyzer(__dirname);
const mediaAnalyzer = new MediaAnalyzer(__dirname);
const qualityOrchestrator = new QualityOrchestrator(__dirname);
|
|
640
|
-
// Subscribe to job updates for real-time streaming to the UI.
// NOTE(review): `server` is declared later in this file; this works only
// because the callback fires after module initialization — confirm.
jobManager.on("jobUpdated", (job) => {
    // Failed jobs are emitted at "error" level so clients can surface them.
    const level = job.status === "failed" ? "error" : "info";
    const statusTag = job.status === "completed" ? "done" : (job.status === "failed" ? "failed" : "running");
    const progress = job.progress > 0 ? `[${job.progress}%]` : "";
    server.sendLoggingMessage({
        level,
        data: `[${statusTag}] [Job ${job.id.substring(0, 8)}] ${progress} ${job.status_text}`
    });
});
|
|
650
|
-
// IMPORTANT: Execute jobs when the manager emits them.
// The manager supplies the job record plus an `execute` wrapper; we hand it a
// task body dispatched on job.type. Errors are logged, never rethrown, so a
// failing job cannot crash the listener.
jobManager.on("processJob", async (job, execute) => {
    console.error(`[Vesper] Listener RECEIVED job: ${job?.id}, execute type: ${typeof execute}`);
    // Defensive check: a non-callable `execute` indicates a wiring bug.
    if (typeof execute !== 'function') {
        console.error(`[CRITICAL] execute is NOT a function! It is: ${typeof execute}`);
        logError(new Error(`execute is ${typeof execute}`), "listener:execute_check");
        return;
    }
    // Task body: job.metadata is stored as a JSON string on the job row.
    const prepareDatasetTask = async () => {
        console.error(`[Vesper] Starting prepareDatasetTask for job ${job.id}...`);
        const metadata = job.metadata ? JSON.parse(job.metadata) : {};
        switch (job.type) {
            case "prepare": return await handlePrepareJob(job.id, metadata.query, metadata.requirements, metadata.outputDir);
            case "clean": return await handleCleanJob(job.id, metadata.datasetId, metadata.ops);
            default: throw new Error(`Unhandled job type: ${job.type}`);
        }
    };
    try {
        console.error(`[Vesper] Calling execute(prepareDatasetTask) for ${job.id}...`);
        await execute(prepareDatasetTask);
        console.error(`[Vesper] execute(prepareDatasetTask) COMPLETED for ${job.id}`);
    }
    catch (e) {
        logError(e, `processJob:${job.type}:${job.id}`);
        console.error(`[Vesper] Error in execute wrapper for ${job.id}: ${e.message}`);
    }
});
|
|
677
|
-
/**
 * Logic for preparing a dataset (Search + Ingest + Process).
 *
 * Pipeline: search → validate → download → normalize → quality → register.
 *
 * @param {string} jobId - Job whose progress/status is updated throughout.
 * @param {string} query - Natural-language query or explicit dataset ref
 *   (e.g. "kaggle:user/ds", "hf:user/ds", or bare "user/ds" → HuggingFace).
 * @param {string} [requirements] - Extra requirement text appended to search.
 * @param {string} [outputDir] - Target directory for the installed dataset.
 * @returns {Promise<string>} Path where the dataset was installed.
 * @throws On missing credentials, no search matches, unmet row targets, or
 *   install failures.
 */
async function handlePrepareJob(jobId, query, requirements, outputDir) {
    hydrateExternalKeys();
    const update = (updates) => jobManager.updateJob(jobId, updates);
    const pipelineSteps = ["search", "validate", "download", "normalize", "quality", "register"];
    const stepStatus = {};
    for (const s of pipelineSteps)
        stepStatus[s] = "pending";
    // Record a step transition and log a one-line ASCII pipeline summary.
    const markPipelineStep = (step, status) => {
        stepStatus[step] = status;
        const summary = pipelineSteps.map(s => {
            const st = stepStatus[s];
            return st === "done" ? `[${s}]` : st === "running" ? `>${s}<` : st === "failed" ? `!${s}!` : st === "skipped" ? `~${s}~` : ` ${s} `;
        }).join(" → ");
        console.error(`[Pipeline] ${summary}`);
    };
    // Ensure core Python packages are available for dataset operations
    try {
        await ensurePythonModules([
            { module: "polars", packageName: "polars" },
            { module: "datasets", packageName: "datasets" },
        ]);
    }
    catch (e) {
        console.error(`[Prepare] Python dependency setup warning: ${e.message}`);
        // Continue anyway - direct file downloads may still work without datasets lib
    }
    const requestedRows = extractRequestedRows(query, requirements);
    const searchQuery = requirements ? `${query} ${requirements}` : query;
    let selectedDataset;
    let datasetIdForDownload = "";
    let source;
    const parsedQuery = parseDatasetId(query);
    // A ref is "explicit" when it carries a provider prefix, or looks like
    // "user/dataset" (contains a slash and no spaces).
    const isExplicitDatasetRef = /^(kaggle:|hf:|huggingface:|openml:|dataworld:)/i.test(parsedQuery) || (query.includes("/") && !query.includes(" "));
    if (isExplicitDatasetRef) {
        let explicitId = parsedQuery;
        if (/^hf:/i.test(explicitId)) {
            explicitId = explicitId.replace(/^hf:/i, "huggingface:");
        }
        if (/^kaggle:/i.test(explicitId)) {
            source = "kaggle";
            datasetIdForDownload = explicitId.replace(/^kaggle:/i, "");
        }
        else if (/^huggingface:/i.test(explicitId)) {
            source = "huggingface";
            datasetIdForDownload = explicitId.replace(/^huggingface:/i, "");
        }
        else if (/^openml:/i.test(explicitId)) {
            source = "openml";
            datasetIdForDownload = explicitId.replace(/^openml:/i, "");
        }
        else if (/^dataworld:/i.test(explicitId)) {
            source = "dataworld";
            datasetIdForDownload = explicitId.replace(/^dataworld:/i, "");
        }
        else {
            // Default to HuggingFace for ambiguous refs (user/dataset without prefix)
            source = "huggingface";
            datasetIdForDownload = explicitId;
        }
        update({
            progress: 20,
            status_text: `Using explicit dataset id: ${datasetIdForDownload} (${source})`
        });
        markPipelineStep("search", "skipped");
    }
    else {
        markPipelineStep("search", "running");
        update({ progress: 10, status_text: "Searching for best dataset matching query..." });
        const results = await searchEngine.search(searchQuery, { limit: 10 });
        if (results.length === 0) {
            markPipelineStep("search", "failed");
            throw new Error("No datasets found matching the query. Try refining your search terms.");
        }
        // Pick the best result that we can actually download (skip sources requiring missing credentials)
        const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
        const hasDwToken = hasDataWorldToken();
        selectedDataset = results.find(r => {
            const s = (r.source || "").toLowerCase();
            if (s === "kaggle" && !hasKaggleCreds)
                return false;
            if (s === "dataworld" && !hasDwToken)
                return false;
            return true;
        }) || results[0]; // Fallback to first if all require credentials
        datasetIdForDownload = selectedDataset.id;
        source = selectedDataset.source;
        update({
            progress: 20,
            status_text: `Matched: ${selectedDataset.name} (${selectedDataset.source})`
        });
        markPipelineStep("search", "done");
    }
    // Pre-check credentials for sources that require them
    markPipelineStep("validate", "running");
    if (source === "kaggle") {
        const hasKaggleCreds = !!(process.env.KAGGLE_USERNAME && process.env.KAGGLE_KEY && process.env.KAGGLE_USERNAME !== "YOUR_KAGGLE_USERNAME");
        if (!hasKaggleCreds) {
            // Fix: mark the validate step failed before throwing, consistent
            // with the data.world branch below (was previously skipped here).
            markPipelineStep("validate", "failed");
            throw new Error("Kaggle credentials not set. Use the configure_keys tool or set KAGGLE_USERNAME/KAGGLE_KEY environment variables.");
        }
    }
    if (source === "dataworld" && !hasDataWorldToken()) {
        markPipelineStep("validate", "failed");
        throw new Error("data.world token not set. Use the configure_keys tool or set DW_AUTH_TOKEN environment variable.");
    }
    markPipelineStep("validate", "done");
    markPipelineStep("download", "running");
    update({ progress: 30, status_text: `Starting download from ${source}...` });
    // ensureData handles download and returns path to the raw file
    let rawFilePath = await dataIngestor.ensureData(datasetIdForDownload, source, (msg, prog) => {
        update({ status_text: msg, progress: 30 + (prog ? Math.floor(prog * 0.4) : 0) });
    });
    // If the user asked for a minimum row count (and did not pin an explicit
    // dataset), top up by fusing additional search matches until the target
    // is met or candidates are exhausted.
    if (requestedRows && requestedRows > 0 && !isExplicitDatasetRef) {
        update({ progress: 62, status_text: `Validating requested sample count (${requestedRows.toLocaleString()})...` });
        let currentRows = await countRows(rawFilePath);
        if (currentRows < requestedRows) {
            update({ progress: 64, status_text: `Only ${currentRows.toLocaleString()} rows found. Fetching more matching datasets...` });
            const additional = await searchEngine.search(searchQuery, { limit: 8 });
            const sourceFiles = [rawFilePath];
            let totalRows = currentRows;
            for (const ds of additional) {
                if (ds.id === datasetIdForDownload)
                    continue;
                try {
                    const dsSource = ds.source;
                    if (dsSource === "kaggle" && !dataIngestor.hasKaggleCredentials())
                        continue;
                    const p = await dataIngestor.ensureData(ds.id, dsSource, () => undefined);
                    const r = await countRows(p);
                    if (r <= 0)
                        continue;
                    sourceFiles.push(p);
                    totalRows += r;
                    if (totalRows >= requestedRows)
                        break;
                }
                catch {
                    // ignore candidate failures and continue trying
                }
            }
            if (sourceFiles.length > 1) {
                update({ progress: 67, status_text: `Fusing ${sourceFiles.length} datasets to meet row target...` });
                const fusedPath = path.join(dataRoot, "fusion", `prepare_fused_${Date.now()}.feather`);
                const fusionResult = await fusionEngine.fuse(sourceFiles, fusedPath, {
                    strategy: "concat",
                    dedup: true,
                    run_quality_after: false,
                    leakage_check: false,
                    output_format: "feather",
                    compression: "lz4",
                    preview: true,
                });
                rawFilePath = fusionResult.output_path;
                try {
                    // Register fused output for this top dataset so export can find it
                    upsertRegistry(datasetIdForDownload, rawFilePath, "completed");
                }
                catch (e) {
                    console.error(`[Registry] Failed to write registry for fused output ${datasetIdForDownload}: ${e?.message || e}`);
                }
                currentRows = await countRows(rawFilePath);
            }
            if (currentRows < requestedRows) {
                throw new Error(`Requested ${requestedRows.toLocaleString()} samples, but only ${currentRows.toLocaleString()} available across current matches. ` +
                    `Try broader query or enable additional sources.`);
            }
            update({ progress: 69, status_text: `Sample target met: ${currentRows.toLocaleString()} rows` });
        }
    }
    markPipelineStep("download", "done");
    // ── Normalize step: convert any raw format → parquet ──
    markPipelineStep("normalize", "running");
    const rawExt = path.extname(rawFilePath).toLowerCase();
    if (rawExt !== ".parquet" && rawExt !== ".pq") {
        update({ progress: 70, status_text: "Normalizing to parquet..." });
        const normalizedDir = path.join(dataRoot, "data", "normalized");
        if (!fs.existsSync(normalizedDir))
            fs.mkdirSync(normalizedDir, { recursive: true });
        const safeId = toSafeDatasetPathFragment(datasetIdForDownload);
        const normalizedPath = path.join(normalizedDir, `${safeId}.parquet`);
        try {
            const normScript = path.join(dataRoot, "python", "normalize_engine.py");
            const normResult = await runPythonJson(normScript, [rawFilePath, normalizedPath]);
            if (normResult.ok && fs.existsSync(normalizedPath)) {
                console.error(`[Prepare] Normalized ${rawExt} → parquet (${normResult.rows} rows)`);
                rawFilePath = normalizedPath;
                markPipelineStep("normalize", "done");
            }
            else {
                console.error(`[Prepare] Normalize failed: ${normResult.error}, continuing with raw file`);
                markPipelineStep("normalize", "skipped");
            }
        }
        catch (e) {
            console.error(`[Prepare] Normalize step failed: ${e?.message || e}, continuing with raw file`);
            markPipelineStep("normalize", "skipped");
        }
    }
    else {
        markPipelineStep("normalize", "done");
    }
    // Quality analysis is best-effort: fall back to the search score, or 70.
    let qualityScore = selectedDataset?.quality_score ?? 70;
    markPipelineStep("quality", "running");
    update({ progress: 75, status_text: "Analyzing dataset quality..." });
    try {
        const report = await qualityAnalyzer.analyze(rawFilePath);
        qualityScore = report.overall_score;
        markPipelineStep("quality", "done");
    }
    catch (error) {
        console.error(`[Prepare] Quality analysis failed for ${datasetIdForDownload}: ${error?.message || error}`);
        update({ progress: 78, status_text: "Quality analysis skipped (unsupported schema). Continuing installation..." });
        markPipelineStep("quality", "skipped");
    }
    if (selectedDataset) {
        metadataStore.saveDataset({
            ...selectedDataset,
            quality_score: qualityScore
        });
    }
    else {
        // Even for explicit dataset refs, save minimal metadata so get_dataset_info can find it
        try {
            const existingMeta = metadataStore.getDataset(datasetIdForDownload);
            if (!existingMeta) {
                metadataStore.saveDataset({
                    id: datasetIdForDownload,
                    source: source,
                    name: datasetIdForDownload.split("/").pop() || datasetIdForDownload,
                    description: `Dataset prepared from ${source}:${datasetIdForDownload}`,
                    quality_warnings: [],
                    downloads: 0,
                    likes: 0,
                    stars: 0,
                    tags: [],
                    last_updated: new Date().toISOString(),
                    task: "unknown",
                    domain: "unknown",
                    languages: [],
                    splits: [],
                    license: { id: "unknown", category: "unknown", usage_restrictions: [], warnings: [] },
                    quality_score: qualityScore,
                    download_url: source === "huggingface" ? `https://huggingface.co/datasets/${datasetIdForDownload}` : "",
                    total_examples: 0,
                    is_structured: false,
                    has_target_column: false,
                    is_safe_source: true,
                    has_personal_data: false,
                    is_paywalled: false,
                    is_scraped_web_data: false,
                    uses_https: true,
                    has_train_split: false,
                    has_test_split: false,
                    has_validation_split: false,
                    description_length: 0,
                    has_readme: false,
                });
            }
        }
        catch (e) {
            console.error(`[Prepare] Failed to save minimal metadata for ${datasetIdForDownload}: ${e?.message || e}`);
        }
    }
    markPipelineStep("register", "running");
    update({ progress: 85, status_text: "Installing dataset into project..." });
    const installPath = await installService.install(datasetIdForDownload, rawFilePath, outputDir);
    update({ progress: 100, status_text: "Preparation complete!" });
    // Register prepared dataset in local registry for lookup by export/list tools
    try {
        upsertRegistry(datasetIdForDownload, installPath, "completed");
        markPipelineStep("register", "done");
        markStepComplete(datasetIdForDownload, "prepare");
    }
    catch (e) {
        console.error(`[Registry] Failed to write registry for ${datasetIdForDownload}: ${e?.message || e}`);
        markPipelineStep("register", "failed");
    }
    return installPath;
}
|
|
958
|
-
/**
 * Logic for cleaning a dataset.
 *
 * Resolves the dataset's on-disk file by probing, in order: the local
 * registry, the metadata store's download status, conventional raw-data
 * paths, a literal file path, and finally the bundled demo data.
 *
 * @param {string} jobId - Job whose status text is updated.
 * @param {string} datasetId - Dataset id (or direct file path, or "demo").
 * @param {object} ops - Cleaning operations passed to the cleaner.
 * @returns {Promise<string>} Path of the cleaned output file.
 * @throws When no file can be resolved or the cleaner reports failure.
 */
async function handleCleanJob(jobId, datasetId, ops) {
    const update = (updates) => jobManager.updateJob(jobId, updates);
    // Return the first candidate path that is non-empty and exists on disk.
    const firstExisting = (candidates) => candidates.find((p) => p && fs.existsSync(p));
    // 1. Registry (most reliable — includes prepared/fused datasets).
    const regEntry = getRegistryEntry(datasetId);
    let filePath = firstExisting([regEntry?.local_path || regEntry?.path]);
    // 2. Download status recorded in the metadata store.
    if (!filePath) {
        filePath = firstExisting([metadataStore.getDownloadStatus(datasetId)?.local_path]);
    }
    // 3. Conventional raw-data locations, by preferred extension.
    if (!filePath) {
        const safeId = toSafeDatasetPathFragment(datasetId);
        filePath = firstExisting([".parquet", ".csv", ".feather"].map((ext) => path.join(dataRoot, "data", "raw", `${safeId}${ext}`)));
    }
    // 4. The id itself may already be a path on disk.
    if (!filePath && fs.existsSync(datasetId)) {
        filePath = datasetId;
    }
    // 5. Bundled demo data.
    if (!filePath && datasetId === "demo") {
        filePath = firstExisting([
            path.join(dataRoot, "e2e_demo_output", "raw_data.parquet"),
            path.join(dataRoot, "e2e_demo_output", "raw_data.csv"),
        ]);
    }
    if (!filePath) {
        throw new Error(`Data file not found for '${datasetId}'. Download the dataset first using download_dataset or prepare_dataset.`);
    }
    update({ status_text: "Cleaning dataset..." });
    const result = await dataCleaner.clean(filePath, ops);
    if (!result.success)
        throw new Error(result.error);
    return result.output_path;
}
|
|
1013
|
-
// Create the MCP server. The name/version pair is advertised to clients; the
// capabilities object enables the tools and logging features registered below.
// NOTE(review): version is hard-coded "1.0.0" while the package itself ships
// a different version — confirm whether these should be kept in sync.
const server = new Server({
    name: "vesper",
    version: "1.0.0",
}, {
    capabilities: {
        tools: {},
        logging: {},
    },
});
|
|
1023
|
-
// List Tools
|
|
1024
|
-
server.setRequestHandler(ListToolsRequestSchema, async () => {
|
|
1025
|
-
return {
|
|
1026
|
-
tools: [
|
|
1027
|
-
{
|
|
1028
|
-
name: "vesper_search",
|
|
1029
|
-
description: "Search for datasets using natural language. Supports negative keywords (e.g., 'finance -crypto'). Returns formatted results with safety indicators, quality warnings, and source badges.",
|
|
1030
|
-
inputSchema: {
|
|
1031
|
-
type: "object",
|
|
1032
|
-
properties: {
|
|
1033
|
-
query: {
|
|
1034
|
-
type: "string",
|
|
1035
|
-
description: "The search query. Use -term to exclude keywords.",
|
|
1036
|
-
},
|
|
1037
|
-
enable_jit: {
|
|
1038
|
-
type: "boolean",
|
|
1039
|
-
description: "Enable live JIT search when local library results are insufficient (default: false).",
|
|
1040
|
-
},
|
|
1041
|
-
},
|
|
1042
|
-
required: ["query"],
|
|
1043
|
-
},
|
|
1044
|
-
},
|
|
1045
|
-
{
|
|
1046
|
-
name: "unified_dataset_api",
|
|
1047
|
-
description: "Single facade over multiple external dataset providers. Supports provider discovery, dataset search, dataset download, and dataset info through one MCP tool using public access and server-managed credentials when available.",
|
|
1048
|
-
inputSchema: {
|
|
1049
|
-
type: "object",
|
|
1050
|
-
properties: {
|
|
1051
|
-
operation: {
|
|
1052
|
-
type: "string",
|
|
1053
|
-
enum: ["providers", "discover", "download", "info"],
|
|
1054
|
-
description: "Gateway operation to execute.",
|
|
1055
|
-
},
|
|
1056
|
-
source: {
|
|
1057
|
-
type: "string",
|
|
1058
|
-
enum: ["auto", "huggingface", "kaggle", "openml", "dataworld", "s3", "bigquery"],
|
|
1059
|
-
description: "Optional provider selector. Use 'auto' to let Vesper choose a compatible backend.",
|
|
1060
|
-
},
|
|
1061
|
-
query: {
|
|
1062
|
-
type: "string",
|
|
1063
|
-
description: "Dataset discovery query. Required for operation='discover'.",
|
|
1064
|
-
},
|
|
1065
|
-
dataset_id: {
|
|
1066
|
-
type: "string",
|
|
1067
|
-
description: "Dataset identifier or object reference. Required for operation='download' and operation='info'. Supports prefixed ids like 'huggingface:user/dataset' and public S3 URIs like 's3://bucket/key'.",
|
|
1068
|
-
},
|
|
1069
|
-
limit: {
|
|
1070
|
-
type: "number",
|
|
1071
|
-
description: "Max results for operation='discover' (default: 10).",
|
|
1072
|
-
},
|
|
1073
|
-
target_dir: {
|
|
1074
|
-
type: "string",
|
|
1075
|
-
description: "Optional output directory for operation='download'. Defaults to the current working directory when omitted.",
|
|
1076
|
-
},
|
|
1077
|
-
output_dir: {
|
|
1078
|
-
type: "string",
|
|
1079
|
-
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
1080
|
-
},
|
|
1081
|
-
public_only: {
|
|
1082
|
-
type: "boolean",
|
|
1083
|
-
description: "When true, discover/info stay on public providers only unless a specific source is requested.",
|
|
1084
|
-
},
|
|
1085
|
-
include_unavailable: {
|
|
1086
|
-
type: "boolean",
|
|
1087
|
-
description: "When true, operation='providers' includes connectors that are scaffolded but not currently configured.",
|
|
1088
|
-
},
|
|
1089
|
-
},
|
|
1090
|
-
required: ["operation"],
|
|
1091
|
-
},
|
|
1092
|
-
},
|
|
1093
|
-
{
|
|
1094
|
-
name: "discover_datasets",
|
|
1095
|
-
description: "Discover datasets from a specific source. Public providers work keylessly; Kaggle and data.world can also be exposed through server-managed credentials.",
|
|
1096
|
-
inputSchema: {
|
|
1097
|
-
type: "object",
|
|
1098
|
-
properties: {
|
|
1099
|
-
query: {
|
|
1100
|
-
type: "string",
|
|
1101
|
-
description: "Search query, e.g. 'credit risk'.",
|
|
1102
|
-
},
|
|
1103
|
-
source: {
|
|
1104
|
-
type: "string",
|
|
1105
|
-
enum: ["huggingface", "kaggle", "openml", "dataworld"],
|
|
1106
|
-
description: "Data source to discover from.",
|
|
1107
|
-
},
|
|
1108
|
-
limit: {
|
|
1109
|
-
type: "number",
|
|
1110
|
-
description: "Max results to return (default: 10).",
|
|
1111
|
-
},
|
|
1112
|
-
},
|
|
1113
|
-
required: ["query"],
|
|
1114
|
-
},
|
|
1115
|
-
},
|
|
1116
|
-
{
|
|
1117
|
-
name: "download_dataset",
|
|
1118
|
-
description: "Download a dataset by source and ID/slug into a local directory. Defaults to HuggingFace. Public providers work keylessly, while Kaggle and data.world can use server-managed credentials when configured.",
|
|
1119
|
-
inputSchema: {
|
|
1120
|
-
type: "object",
|
|
1121
|
-
properties: {
|
|
1122
|
-
source: {
|
|
1123
|
-
type: "string",
|
|
1124
|
-
enum: ["huggingface", "kaggle", "openml", "dataworld"],
|
|
1125
|
-
description: "Dataset source (default: huggingface). HuggingFace and OpenML work without credentials.",
|
|
1126
|
-
},
|
|
1127
|
-
dataset_id: {
|
|
1128
|
-
type: "string",
|
|
1129
|
-
description: "Dataset ID/slug (e.g. user/dataset for Kaggle or HF).",
|
|
1130
|
-
},
|
|
1131
|
-
target_dir: {
|
|
1132
|
-
type: "string",
|
|
1133
|
-
description: "Optional target directory for downloaded files. Defaults to the current working directory when omitted.",
|
|
1134
|
-
},
|
|
1135
|
-
output_dir: {
|
|
1136
|
-
type: "string",
|
|
1137
|
-
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
1138
|
-
}
|
|
1139
|
-
},
|
|
1140
|
-
required: ["dataset_id"],
|
|
1141
|
-
},
|
|
1142
|
-
},
|
|
1143
|
-
{
|
|
1144
|
-
name: "vesper_download_assets",
|
|
1145
|
-
description: "Download real image/media assets using smart source-aware recipes (HuggingFace, Kaggle, direct URL). Auto-detects image columns from HF feature types, column names, and value patterns. Supports PIL Images, URL-based images, and binary image data.",
|
|
1146
|
-
inputSchema: {
|
|
1147
|
-
type: "object",
|
|
1148
|
-
properties: {
|
|
1149
|
-
dataset_id: { type: "string", description: "Unique dataset identifier (e.g. 'user/dataset')." },
|
|
1150
|
-
source: { type: "string", enum: ["huggingface", "kaggle", "url"], description: "Asset source type." },
|
|
1151
|
-
repo_id: { type: "string", description: "Repo ID for HuggingFace (e.g. 'user/dataset'). Auto-inferred from dataset_id if omitted." },
|
|
1152
|
-
kaggle_ref: { type: "string", description: "Kaggle dataset ref (owner/dataset)." },
|
|
1153
|
-
urls: { type: "array", items: { type: "string" }, description: "Direct asset URLs." },
|
|
1154
|
-
output_format: { type: "string", enum: ["webdataset", "imagefolder", "parquet"], description: "Output asset format." },
|
|
1155
|
-
target_dir: { type: "string", description: "Optional local directory where downloaded assets should be written. If provided, Vesper writes directly to this directory instead of managed asset storage." },
|
|
1156
|
-
output_dir: { type: "string", description: "Alias for target_dir. When provided, downloaded assets are written directly to this local directory." },
|
|
1157
|
-
max_items: { type: "number", description: "Optional cap on number of assets to fetch." },
|
|
1158
|
-
workers: { type: "number", description: "Parallel worker count (default 8)." },
|
|
1159
|
-
image_column: { type: "string", description: "Explicit image column name. If omitted, auto-detected from HF features, column names, and sample values." },
|
|
1160
|
-
},
|
|
1161
|
-
required: ["dataset_id", "source"],
|
|
1162
|
-
},
|
|
1163
|
-
},
|
|
1164
|
-
{
|
|
1165
|
-
name: "configure_kaggle",
|
|
1166
|
-
description: "Optionally store Kaggle API credentials for Kaggle discover/download. Core Vesper works without this.",
|
|
1167
|
-
inputSchema: {
|
|
1168
|
-
type: "object",
|
|
1169
|
-
properties: {
|
|
1170
|
-
username: { type: "string", description: "Kaggle username" },
|
|
1171
|
-
key: { type: "string", description: "Kaggle API key" }
|
|
1172
|
-
},
|
|
1173
|
-
required: ["username", "key"],
|
|
1174
|
-
},
|
|
1175
|
-
},
|
|
1176
|
-
{
|
|
1177
|
-
name: "configure_keys",
|
|
1178
|
-
description: "One-time optional key setup for external sources (Kaggle, data.world, gated HF). Core tools do not require keys.",
|
|
1179
|
-
inputSchema: {
|
|
1180
|
-
type: "object",
|
|
1181
|
-
properties: {
|
|
1182
|
-
hf_token: { type: "string", description: "Optional Hugging Face token for gated/private datasets" },
|
|
1183
|
-
kaggle_username: { type: "string", description: "Optional Kaggle username" },
|
|
1184
|
-
kaggle_key: { type: "string", description: "Optional Kaggle API key" },
|
|
1185
|
-
dataworld_token: { type: "string", description: "Optional data.world API token" }
|
|
1186
|
-
},
|
|
1187
|
-
},
|
|
1188
|
-
},
|
|
1189
|
-
{
|
|
1190
|
-
name: "get_dataset_info",
|
|
1191
|
-
description: "Get detailed metadata for a specific dataset by its ID. Returns comprehensive information including license, safety flags, and data characteristics.",
|
|
1192
|
-
inputSchema: {
|
|
1193
|
-
type: "object",
|
|
1194
|
-
properties: {
|
|
1195
|
-
dataset_id: {
|
|
1196
|
-
type: "string",
|
|
1197
|
-
description: "The unique dataset ID (e.g., 'user/dataset_name' for HuggingFace, 'kaggle:username/dataset' for Kaggle, 'openml:1234' for OpenML, or 'dataworld:owner/id' for data.world)",
|
|
1198
|
-
},
|
|
1199
|
-
},
|
|
1200
|
-
required: ["dataset_id"],
|
|
1201
|
-
},
|
|
1202
|
-
},
|
|
1203
|
-
{
|
|
1204
|
-
name: "analyze_quality",
|
|
1205
|
-
description: "Perform a deep quality check on a dataset. Returns a detailed report including duplicates, outliers, and schema issues.",
|
|
1206
|
-
inputSchema: {
|
|
1207
|
-
type: "object",
|
|
1208
|
-
properties: {
|
|
1209
|
-
dataset_id: {
|
|
1210
|
-
type: "string",
|
|
1211
|
-
description: "The dataset ID to analyze.",
|
|
1212
|
-
},
|
|
1213
|
-
},
|
|
1214
|
-
required: ["dataset_id"],
|
|
1215
|
-
},
|
|
1216
|
-
},
|
|
1217
|
-
{
|
|
1218
|
-
name: "preview_cleaning",
|
|
1219
|
-
description: "Dry-run of the cleaning plan. Shows what WOULD be cleaned without modifying data.",
|
|
1220
|
-
inputSchema: {
|
|
1221
|
-
type: "object",
|
|
1222
|
-
properties: {
|
|
1223
|
-
dataset_id: {
|
|
1224
|
-
type: "string",
|
|
1225
|
-
description: "The dataset ID to preview cleaning for.",
|
|
1226
|
-
},
|
|
1227
|
-
},
|
|
1228
|
-
required: ["dataset_id"],
|
|
1229
|
-
},
|
|
1230
|
-
},
|
|
1231
|
-
{
|
|
1232
|
-
name: "custom_clean",
|
|
1233
|
-
description: "Apply specific cleaning operations to a dataset as an asynchronous job. Supports: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories. The dataset must be downloaded first.",
|
|
1234
|
-
inputSchema: {
|
|
1235
|
-
type: "object",
|
|
1236
|
-
properties: {
|
|
1237
|
-
dataset_id: {
|
|
1238
|
-
type: "string",
|
|
1239
|
-
description: "The dataset ID to clean.",
|
|
1240
|
-
},
|
|
1241
|
-
operations: {
|
|
1242
|
-
type: "array",
|
|
1243
|
-
items: {
|
|
1244
|
-
type: "object",
|
|
1245
|
-
properties: {
|
|
1246
|
-
type: { type: "string" },
|
|
1247
|
-
params: { type: "object" },
|
|
1248
|
-
reason: { type: "string" },
|
|
1249
|
-
},
|
|
1250
|
-
},
|
|
1251
|
-
},
|
|
1252
|
-
},
|
|
1253
|
-
required: ["dataset_id", "operations"],
|
|
1254
|
-
},
|
|
1255
|
-
},
|
|
1256
|
-
{
|
|
1257
|
-
name: "prepare_dataset",
|
|
1258
|
-
description: "Full pipeline: Search, Download, Analyze, Clean, Split, and Install a dataset as an asynchronous job. Automatically selects the best available source (prefers HuggingFace/OpenML when no Kaggle credentials are set). Use check_job_status to monitor progress.",
|
|
1259
|
-
inputSchema: {
|
|
1260
|
-
type: "object",
|
|
1261
|
-
properties: {
|
|
1262
|
-
query: { type: "string" },
|
|
1263
|
-
requirements: { type: "string" },
|
|
1264
|
-
target_dir: { type: "string", description: "Optional local directory for the prepared dataset. Defaults to the current working directory when omitted." },
|
|
1265
|
-
output_dir: { type: "string", description: "Alias for target_dir. Defaults to the current working directory when omitted." },
|
|
1266
|
-
download_images: { type: "boolean", description: "When true, enables post-prepare smart asset downloading for image/media datasets." },
|
|
1267
|
-
cleaning_options: { type: "object" },
|
|
1268
|
-
split_config: { type: "object" },
|
|
1269
|
-
},
|
|
1270
|
-
required: ["query"],
|
|
1271
|
-
},
|
|
1272
|
-
},
|
|
1273
|
-
{
|
|
1274
|
-
name: "compare_datasets",
|
|
1275
|
-
description: "Compare 2-3 datasets side-by-side.",
|
|
1276
|
-
inputSchema: {
|
|
1277
|
-
type: "object",
|
|
1278
|
-
properties: {
|
|
1279
|
-
dataset_ids: {
|
|
1280
|
-
type: "array",
|
|
1281
|
-
items: { type: "string" },
|
|
1282
|
-
},
|
|
1283
|
-
},
|
|
1284
|
-
required: ["dataset_ids"],
|
|
1285
|
-
},
|
|
1286
|
-
},
|
|
1287
|
-
{
|
|
1288
|
-
name: "check_job_status",
|
|
1289
|
-
description: "Check the status of an asynchronous job.",
|
|
1290
|
-
inputSchema: {
|
|
1291
|
-
type: "object",
|
|
1292
|
-
properties: {
|
|
1293
|
-
job_id: { type: "string" },
|
|
1294
|
-
},
|
|
1295
|
-
required: ["job_id"],
|
|
1296
|
-
},
|
|
1297
|
-
},
|
|
1298
|
-
{
|
|
1299
|
-
name: "export_dataset",
|
|
1300
|
-
description: "Export a dataset to a local directory. Use format='parquet' (default) for efficient analytics and broad interoperability. Add fast=true to skip quality/cleaning steps.",
|
|
1301
|
-
inputSchema: {
|
|
1302
|
-
type: "object",
|
|
1303
|
-
properties: {
|
|
1304
|
-
dataset_id: {
|
|
1305
|
-
type: "string",
|
|
1306
|
-
description: "The unique dataset ID.",
|
|
1307
|
-
},
|
|
1308
|
-
target_dir: {
|
|
1309
|
-
type: "string",
|
|
1310
|
-
description: "Optional custom local directory for export. Defaults to the current working directory when omitted.",
|
|
1311
|
-
},
|
|
1312
|
-
output_dir: {
|
|
1313
|
-
type: "string",
|
|
1314
|
-
description: "Alias for target_dir. Defaults to the current working directory when omitted.",
|
|
1315
|
-
},
|
|
1316
|
-
format: {
|
|
1317
|
-
type: "string",
|
|
1318
|
-
enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
|
|
1319
|
-
description: "Output format. parquet (default, analytics-friendly), feather (fast local IO), csv (human-readable).",
|
|
1320
|
-
},
|
|
1321
|
-
compression: {
|
|
1322
|
-
type: "string",
|
|
1323
|
-
enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"],
|
|
1324
|
-
description: "Compression algorithm. Default: lz4 for feather, snappy for parquet, none for csv.",
|
|
1325
|
-
},
|
|
1326
|
-
fast: {
|
|
1327
|
-
type: "boolean",
|
|
1328
|
-
description: "Skip quality analysis and cleaning – raw export only. Much faster. Default: false.",
|
|
1329
|
-
},
|
|
1330
|
-
preview: {
|
|
1331
|
-
type: "boolean",
|
|
1332
|
-
description: "Generate a small 500-row CSV preview alongside binary exports. Default: false.",
|
|
1333
|
-
},
|
|
1334
|
-
sample_rows: {
|
|
1335
|
-
type: "number",
|
|
1336
|
-
description: "Export only this many random rows (faster for huge datasets).",
|
|
1337
|
-
},
|
|
1338
|
-
columns: {
|
|
1339
|
-
type: "array",
|
|
1340
|
-
items: { type: "string" },
|
|
1341
|
-
description: "Export only these columns (faster for wide datasets).",
|
|
1342
|
-
},
|
|
1343
|
-
},
|
|
1344
|
-
required: ["dataset_id"],
|
|
1345
|
-
},
|
|
1346
|
-
},
|
|
1347
|
-
{
|
|
1348
|
-
name: "vesper_list_datasets",
|
|
1349
|
-
description: "List local prepared datasets from the Vesper registry (dataset_id and local_path).",
|
|
1350
|
-
inputSchema: {
|
|
1351
|
-
type: "object",
|
|
1352
|
-
properties: {},
|
|
1353
|
-
},
|
|
1354
|
-
},
|
|
1355
|
-
{
|
|
1356
|
-
name: "vesper_convert_format",
|
|
1357
|
-
description: "Convert a dataset file between formats (CSV, Parquet, JSON, JSONL). Auto-detects input format from extension. Saves output in the same directory with the new extension and registers it in the Vesper registry.",
|
|
1358
|
-
inputSchema: {
|
|
1359
|
-
type: "object",
|
|
1360
|
-
properties: {
|
|
1361
|
-
file_path: {
|
|
1362
|
-
type: "string",
|
|
1363
|
-
description: "Absolute path to the input dataset file.",
|
|
1364
|
-
},
|
|
1365
|
-
target_format: {
|
|
1366
|
-
type: "string",
|
|
1367
|
-
enum: ["csv", "parquet", "json", "jsonl"],
|
|
1368
|
-
description: "The desired output format.",
|
|
1369
|
-
},
|
|
1370
|
-
},
|
|
1371
|
-
required: ["file_path", "target_format"],
|
|
1372
|
-
},
|
|
1373
|
-
},
|
|
1374
|
-
{
|
|
1375
|
-
name: "fuse_datasets",
|
|
1376
|
-
description: "Fuse/combine multiple datasets via concat or join. Optionally runs quality & leakage checks afterward.",
|
|
1377
|
-
inputSchema: {
|
|
1378
|
-
type: "object",
|
|
1379
|
-
properties: {
|
|
1380
|
-
sources: {
|
|
1381
|
-
type: "array",
|
|
1382
|
-
items: { type: "string" },
|
|
1383
|
-
description: "List of dataset IDs and/or local file paths to fuse.",
|
|
1384
|
-
},
|
|
1385
|
-
strategy: {
|
|
1386
|
-
type: "string",
|
|
1387
|
-
enum: ["concat", "join"],
|
|
1388
|
-
description: "Fusion strategy. concat appends rows; join merges on key(s).",
|
|
1389
|
-
},
|
|
1390
|
-
join_on: {
|
|
1391
|
-
oneOf: [
|
|
1392
|
-
{ type: "string" },
|
|
1393
|
-
{ type: "array", items: { type: "string" } }
|
|
1394
|
-
],
|
|
1395
|
-
description: "Join key(s). Required when strategy='join'.",
|
|
1396
|
-
},
|
|
1397
|
-
how: {
|
|
1398
|
-
type: "string",
|
|
1399
|
-
enum: ["inner", "left", "outer"],
|
|
1400
|
-
description: "Join mode (only for strategy='join').",
|
|
1401
|
-
},
|
|
1402
|
-
dedup: {
|
|
1403
|
-
type: "boolean",
|
|
1404
|
-
description: "Drop exact duplicate rows after fusion.",
|
|
1405
|
-
},
|
|
1406
|
-
run_quality_after: {
|
|
1407
|
-
type: "boolean",
|
|
1408
|
-
description: "Run quality analysis on the fused output.",
|
|
1409
|
-
},
|
|
1410
|
-
leakage_check: {
|
|
1411
|
-
type: "boolean",
|
|
1412
|
-
description: "Run leakage/overlap checks across fused sources.",
|
|
1413
|
-
},
|
|
1414
|
-
output_format: {
|
|
1415
|
-
type: "string",
|
|
1416
|
-
enum: ["feather", "parquet", "csv", "jsonl", "arrow"],
|
|
1417
|
-
description: "Output format (default: parquet).",
|
|
1418
|
-
},
|
|
1419
|
-
compression: {
|
|
1420
|
-
type: "string",
|
|
1421
|
-
enum: ["lz4", "zstd", "snappy", "gzip", "uncompressed"],
|
|
1422
|
-
description: "Compression algorithm for binary outputs.",
|
|
1423
|
-
},
|
|
1424
|
-
preview: {
|
|
1425
|
-
type: "boolean",
|
|
1426
|
-
description: "Generate a small preview CSV of fused output.",
|
|
1427
|
-
},
|
|
1428
|
-
},
|
|
1429
|
-
required: ["sources"],
|
|
1430
|
-
},
|
|
1431
|
-
},
|
|
1432
|
-
{
|
|
1433
|
-
name: "analyze_image_quality",
|
|
1434
|
-
description: "Analyze image quality (resolution, blur, corruption) for a folder or single image.",
|
|
1435
|
-
inputSchema: {
|
|
1436
|
-
type: "object",
|
|
1437
|
-
properties: {
|
|
1438
|
-
path: {
|
|
1439
|
-
type: "string",
|
|
1440
|
-
description: "Absolute path to the image file or folder.",
|
|
1441
|
-
},
|
|
1442
|
-
},
|
|
1443
|
-
required: ["path"],
|
|
1444
|
-
},
|
|
1445
|
-
},
|
|
1446
|
-
{
|
|
1447
|
-
name: "analyze_media_quality",
|
|
1448
|
-
description: "Analyze audio/video quality (sample rate, duration, FPS, corruption) for a folder or single file.",
|
|
1449
|
-
inputSchema: {
|
|
1450
|
-
type: "object",
|
|
1451
|
-
properties: {
|
|
1452
|
-
path: {
|
|
1453
|
-
type: "string",
|
|
1454
|
-
description: "Absolute path to the audio/video file or folder.",
|
|
1455
|
-
},
|
|
1456
|
-
},
|
|
1457
|
-
required: ["path"],
|
|
1458
|
-
},
|
|
1459
|
-
},
|
|
1460
|
-
{
|
|
1461
|
-
name: "generate_quality_report",
|
|
1462
|
-
description: "Generate a comprehensive unified quality report for a multimodal dataset (text, image, audio, video).",
|
|
1463
|
-
inputSchema: {
|
|
1464
|
-
type: "object",
|
|
1465
|
-
properties: {
|
|
1466
|
-
dataset_id: {
|
|
1467
|
-
type: "string",
|
|
1468
|
-
description: "Dataset identifier.",
|
|
1469
|
-
},
|
|
1470
|
-
dataset_path: {
|
|
1471
|
-
type: "string",
|
|
1472
|
-
description: "Absolute path to the dataset directory.",
|
|
1473
|
-
},
|
|
1474
|
-
},
|
|
1475
|
-
required: ["dataset_id", "dataset_path"],
|
|
1476
|
-
},
|
|
1477
|
-
},
|
|
1478
|
-
],
|
|
1479
|
-
};
|
|
1480
|
-
});
|
|
1481
|
-
// Call Tool — all requests are serialized through a queue to prevent crashes from parallel calls
|
|
1482
|
-
server.setRequestHandler(CallToolRequestSchema, async (request) => {
|
|
1483
|
-
return requestQueue.enqueue(async () => {
|
|
1484
|
-
// --- Pipeline Enforcement ---
|
|
1485
|
-
// Map tool names to pipeline steps
|
|
1486
|
-
const toolToStep = {
|
|
1487
|
-
vesper_search: "search",
|
|
1488
|
-
vesper_download: "download",
|
|
1489
|
-
vesper_analyze: "analyze",
|
|
1490
|
-
vesper_clean: "clean",
|
|
1491
|
-
vesper_split: "split",
|
|
1492
|
-
vesper_export: "export",
|
|
1493
|
-
prepare_dataset: "prepare",
|
|
1494
|
-
};
|
|
1495
|
-
// Extract dataset_id if present and normalize
|
|
1496
|
-
let datasetId = request.params.arguments?.dataset_id || request.params.arguments?.query || "";
|
|
1497
|
-
if (datasetId)
|
|
1498
|
-
datasetId = parseDatasetId(String(datasetId));
|
|
1499
|
-
// Pipeline rules
|
|
1500
|
-
const stepOrder = ["search", "download", "analyze", "clean", "split", "export"];
|
|
1501
|
-
const prereqs = {
|
|
1502
|
-
vesper_download: ["search"],
|
|
1503
|
-
vesper_analyze: ["download"],
|
|
1504
|
-
vesper_clean: ["analyze"],
|
|
1505
|
-
vesper_split: ["clean"],
|
|
1506
|
-
vesper_export: ["split"],
|
|
1507
|
-
};
|
|
1508
|
-
const tool = String(request.params.name);
|
|
1509
|
-
const step = toolToStep[tool];
|
|
1510
|
-
if (step && datasetId) {
|
|
1511
|
-
// Check prerequisites
|
|
1512
|
-
const required = prereqs[tool] || [];
|
|
1513
|
-
for (const req of required) {
|
|
1514
|
-
if (!hasStep(String(datasetId), req)) {
|
|
1515
|
-
// Auto-run missing step if possible, else error
|
|
1516
|
-
// For export, auto-run prepare_dataset if split missing
|
|
1517
|
-
if (tool === "vesper_export" && req === "split") {
|
|
1518
|
-
// Auto-trigger prepare_dataset (start a background prepare job)
|
|
1519
|
-
try {
|
|
1520
|
-
jobManager.createJob("prepare", 0, { query: String(datasetId), requirements: undefined, downloadImages: false, outputDir: process.cwd() });
|
|
1521
|
-
// Mark split as complete so export can proceed; export handler will also wait for data if needed.
|
|
1522
|
-
markStepComplete(String(datasetId), "split");
|
|
1523
|
-
}
|
|
1524
|
-
catch (e) {
|
|
1525
|
-
console.error(`[Pipeline] Failed to auto-trigger prepare for ${datasetId}: ${e?.message || e}`);
|
|
1526
|
-
return {
|
|
1527
|
-
content: [{ type: "text", text: `ERROR: Failed to auto-run prepare for ${datasetId}. Please run prepare_dataset first.` }],
|
|
1528
|
-
isError: true,
|
|
1529
|
-
};
|
|
1530
|
-
}
|
|
1531
|
-
}
|
|
1532
|
-
else {
|
|
1533
|
-
return {
|
|
1534
|
-
content: [{ type: "text", text: `ERROR: Cannot run ${tool} before ${req}. Please run ${req} first.` }],
|
|
1535
|
-
isError: true,
|
|
1536
|
-
};
|
|
1537
|
-
}
|
|
1538
|
-
}
|
|
1539
|
-
}
|
|
1540
|
-
// Mark this step as complete
|
|
1541
|
-
markStepComplete(String(datasetId), String(step));
|
|
1542
|
-
}
|
|
1543
|
-
switch (request.params.name) {
|
|
1544
|
-
case "unified_dataset_api": {
|
|
1545
|
-
hydrateExternalKeys();
|
|
1546
|
-
const operation = String(request.params.arguments?.operation || "").trim().toLowerCase();
|
|
1547
|
-
const source = String(request.params.arguments?.source || "auto").trim().toLowerCase();
|
|
1548
|
-
const includeUnavailable = request.params.arguments?.include_unavailable === true;
|
|
1549
|
-
const publicOnly = request.params.arguments?.public_only !== false;
|
|
1550
|
-
try {
|
|
1551
|
-
if (operation === "providers") {
|
|
1552
|
-
return {
|
|
1553
|
-
content: [{ type: "text", text: JSON.stringify({ providers: unifiedDatasetGateway.getProviderStatuses(includeUnavailable) }, null, 2) }],
|
|
1554
|
-
};
|
|
1555
|
-
}
|
|
1556
|
-
if (operation === "discover") {
|
|
1557
|
-
const query = String(request.params.arguments?.query || "").trim();
|
|
1558
|
-
if (!query) {
|
|
1559
|
-
throw new McpError(ErrorCode.InvalidParams, "query is required for operation='discover'");
|
|
1560
|
-
}
|
|
1561
|
-
const result = await unifiedDatasetGateway.discover({
|
|
1562
|
-
query,
|
|
1563
|
-
source,
|
|
1564
|
-
limit: Number(request.params.arguments?.limit || 10),
|
|
1565
|
-
publicOnly,
|
|
1566
|
-
});
|
|
1567
|
-
return {
|
|
1568
|
-
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1569
|
-
};
|
|
1570
|
-
}
|
|
1571
|
-
if (operation === "download") {
|
|
1572
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1573
|
-
if (!datasetId) {
|
|
1574
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='download'");
|
|
1575
|
-
}
|
|
1576
|
-
const requestedTargetDir = request.params.arguments?.target_dir
|
|
1577
|
-
? String(request.params.arguments.target_dir).trim()
|
|
1578
|
-
: request.params.arguments?.output_dir
|
|
1579
|
-
? String(request.params.arguments.output_dir).trim()
|
|
1580
|
-
: "";
|
|
1581
|
-
const targetDir = requestedTargetDir || process.cwd();
|
|
1582
|
-
try {
|
|
1583
|
-
await ensurePythonModules([{ module: "datasets", packageName: "datasets" }]);
|
|
1584
|
-
}
|
|
1585
|
-
catch {
|
|
1586
|
-
// best effort; non-HF providers do not require this
|
|
1587
|
-
}
|
|
1588
|
-
const result = await unifiedDatasetGateway.download({
|
|
1589
|
-
datasetId,
|
|
1590
|
-
source,
|
|
1591
|
-
targetDir,
|
|
1592
|
-
});
|
|
1593
|
-
try {
|
|
1594
|
-
upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
|
|
1595
|
-
}
|
|
1596
|
-
catch (e) {
|
|
1597
|
-
console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
|
|
1598
|
-
}
|
|
1599
|
-
return {
|
|
1600
|
-
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1601
|
-
};
|
|
1602
|
-
}
|
|
1603
|
-
if (operation === "info") {
|
|
1604
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1605
|
-
if (!datasetId) {
|
|
1606
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required for operation='info'");
|
|
1607
|
-
}
|
|
1608
|
-
const result = await unifiedDatasetGateway.info({
|
|
1609
|
-
datasetId,
|
|
1610
|
-
source,
|
|
1611
|
-
publicOnly,
|
|
1612
|
-
});
|
|
1613
|
-
return {
|
|
1614
|
-
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
1615
|
-
};
|
|
1616
|
-
}
|
|
1617
|
-
throw new McpError(ErrorCode.InvalidParams, `Unsupported operation: ${operation}`);
|
|
1618
|
-
}
|
|
1619
|
-
catch (error) {
|
|
1620
|
-
return {
|
|
1621
|
-
content: [{ type: "text", text: `ERROR: unified gateway failed: ${error.message}` }],
|
|
1622
|
-
isError: true,
|
|
1623
|
-
};
|
|
1624
|
-
}
|
|
1625
|
-
}
|
|
1626
|
-
case "vesper_search": {
|
|
1627
|
-
const query = String(request.params.arguments?.query);
|
|
1628
|
-
const limit = 5;
|
|
1629
|
-
const safeOnly = true; // Enable safe filter by default
|
|
1630
|
-
const enableJIT = request.params.arguments?.enable_jit === true;
|
|
1631
|
-
if (!query) {
|
|
1632
|
-
throw new McpError(ErrorCode.InvalidParams, "Query is required");
|
|
1633
|
-
}
|
|
1634
|
-
const results = await searchEngine.search(query, { limit, safeOnly, enableJIT });
|
|
1635
|
-
const formattedOutput = formatSearchResults(results);
|
|
1636
|
-
return {
|
|
1637
|
-
content: [
|
|
1638
|
-
{
|
|
1639
|
-
type: "text",
|
|
1640
|
-
text: formattedOutput,
|
|
1641
|
-
},
|
|
1642
|
-
],
|
|
1643
|
-
};
|
|
1644
|
-
}
|
|
1645
|
-
case "discover_datasets": {
|
|
1646
|
-
hydrateExternalKeys();
|
|
1647
|
-
const query = String(request.params.arguments?.query || "").trim();
|
|
1648
|
-
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1649
|
-
const limit = Number(request.params.arguments?.limit || 10);
|
|
1650
|
-
if (!query) {
|
|
1651
|
-
throw new McpError(ErrorCode.InvalidParams, "query is required");
|
|
1652
|
-
}
|
|
1653
|
-
try {
|
|
1654
|
-
const gatewayResult = await unifiedDatasetGateway.discover({
|
|
1655
|
-
query,
|
|
1656
|
-
source,
|
|
1657
|
-
limit,
|
|
1658
|
-
publicOnly: false,
|
|
1659
|
-
});
|
|
1660
|
-
const results = gatewayResult.results;
|
|
1661
|
-
const recipeScript = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
1662
|
-
for (const ds of results.slice(0, limit)) {
|
|
1663
|
-
const info = {
|
|
1664
|
-
dataset_id: ds.id,
|
|
1665
|
-
id: ds.id,
|
|
1666
|
-
source: ds.source,
|
|
1667
|
-
repo_id: ds.id,
|
|
1668
|
-
total_images: ds.total_examples || 0,
|
|
1669
|
-
image_column: undefined,
|
|
1670
|
-
recipes_dir: path.join(dataRoot, "recipes"),
|
|
1671
|
-
};
|
|
1672
|
-
try {
|
|
1673
|
-
await runPythonJson(recipeScript, ["build_recipe", JSON.stringify(info)]);
|
|
1674
|
-
}
|
|
1675
|
-
catch {
|
|
1676
|
-
// best-effort recipe generation; ignore discovery-time recipe failures
|
|
1677
|
-
}
|
|
1678
|
-
}
|
|
1679
|
-
const formattedOutput = formatSearchResults(results.slice(0, limit));
|
|
1680
|
-
const noteBlock = gatewayResult.notes.length > 0
|
|
1681
|
-
? `\nGateway notes:\n- ${gatewayResult.notes.join("\n- ")}`
|
|
1682
|
-
: "";
|
|
1683
|
-
return {
|
|
1684
|
-
content: [{ type: "text", text: `${formattedOutput}${noteBlock}` }]
|
|
1685
|
-
};
|
|
1686
|
-
}
|
|
1687
|
-
catch (error) {
|
|
1688
|
-
return {
|
|
1689
|
-
content: [{ type: "text", text: `ERROR: discover failed: ${error.message}` }],
|
|
1690
|
-
isError: true,
|
|
1691
|
-
};
|
|
1692
|
-
}
|
|
1693
|
-
}
|
|
1694
|
-
case "download_dataset": {
|
|
1695
|
-
hydrateExternalKeys();
|
|
1696
|
-
const source = String(request.params.arguments?.source || "huggingface").toLowerCase();
|
|
1697
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1698
|
-
const requestedTargetDir = request.params.arguments?.target_dir
|
|
1699
|
-
? String(request.params.arguments.target_dir).trim()
|
|
1700
|
-
: request.params.arguments?.output_dir
|
|
1701
|
-
? String(request.params.arguments.output_dir).trim()
|
|
1702
|
-
: "";
|
|
1703
|
-
const targetDir = requestedTargetDir || process.cwd();
|
|
1704
|
-
if (!datasetId) {
|
|
1705
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1706
|
-
}
|
|
1707
|
-
// Pre-install Python datasets library for HuggingFace fallback
|
|
1708
|
-
if (source === "huggingface") {
|
|
1709
|
-
try {
|
|
1710
|
-
await ensurePythonModules([
|
|
1711
|
-
{ module: "datasets", packageName: "datasets" },
|
|
1712
|
-
]);
|
|
1713
|
-
}
|
|
1714
|
-
catch {
|
|
1715
|
-
// Continue - direct download may still work
|
|
1716
|
-
}
|
|
1717
|
-
}
|
|
1718
|
-
try {
|
|
1719
|
-
const result = await unifiedDatasetGateway.download({
|
|
1720
|
-
datasetId,
|
|
1721
|
-
source,
|
|
1722
|
-
targetDir,
|
|
1723
|
-
});
|
|
1724
|
-
try {
|
|
1725
|
-
upsertRegistry(result.dataset_id, result.copied_to || result.local_path, "completed");
|
|
1726
|
-
}
|
|
1727
|
-
catch (e) {
|
|
1728
|
-
console.error(`[Registry] Failed to write registry for ${result.dataset_id}: ${e?.message || e}`);
|
|
1729
|
-
}
|
|
1730
|
-
const noteBlock = result.notes.length > 0 ? `\nNotes:\n- ${result.notes.join("\n- ")}` : "";
|
|
1731
|
-
return {
|
|
1732
|
-
content: [{ type: "text", text: `Download complete: ${result.copied_to || result.local_path}${noteBlock}` }]
|
|
1733
|
-
};
|
|
1734
|
-
}
|
|
1735
|
-
catch (error) {
|
|
1736
|
-
return {
|
|
1737
|
-
content: [{ type: "text", text: `ERROR: download failed: ${error.message}` }],
|
|
1738
|
-
isError: true,
|
|
1739
|
-
};
|
|
1740
|
-
}
|
|
1741
|
-
}
|
|
1742
|
-
case "vesper_download_assets": {
|
|
1743
|
-
hydrateExternalKeys();
|
|
1744
|
-
const datasetId = String(request.params.arguments?.dataset_id || "").trim();
|
|
1745
|
-
const source = String(request.params.arguments?.source || "").trim().toLowerCase();
|
|
1746
|
-
// Auto-infer repo_id from dataset_id if not provided (common for HuggingFace)
|
|
1747
|
-
const repoId = request.params.arguments?.repo_id
|
|
1748
|
-
? String(request.params.arguments.repo_id)
|
|
1749
|
-
: (source === "huggingface" && datasetId.includes("/") ? datasetId : undefined);
|
|
1750
|
-
const kaggleRef = request.params.arguments?.kaggle_ref ? String(request.params.arguments.kaggle_ref) : undefined;
|
|
1751
|
-
const urls = Array.isArray(request.params.arguments?.urls)
|
|
1752
|
-
? (request.params.arguments?.urls).map(v => String(v))
|
|
1753
|
-
: undefined;
|
|
1754
|
-
const outputFormat = String(request.params.arguments?.output_format || "webdataset");
|
|
1755
|
-
const requestedOutputDir = request.params.arguments?.target_dir
|
|
1756
|
-
? String(request.params.arguments.target_dir).trim()
|
|
1757
|
-
: request.params.arguments?.output_dir
|
|
1758
|
-
? String(request.params.arguments.output_dir).trim()
|
|
1759
|
-
: undefined;
|
|
1760
|
-
const maxItems = request.params.arguments?.max_items ? Number(request.params.arguments.max_items) : undefined;
|
|
1761
|
-
const workers = request.params.arguments?.workers ? Number(request.params.arguments.workers) : 8;
|
|
1762
|
-
const imageColumn = request.params.arguments?.image_column ? String(request.params.arguments.image_column) : undefined;
|
|
1763
|
-
if (!datasetId || !source) {
|
|
1764
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id and source are required");
|
|
1765
|
-
}
|
|
1766
|
-
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
1767
|
-
return {
|
|
1768
|
-
content: [{ type: "text", text: "Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds)." }],
|
|
1769
|
-
isError: true,
|
|
1770
|
-
};
|
|
1771
|
-
}
|
|
1772
|
-
const requiredModules = [
|
|
1773
|
-
{ module: "aiohttp", packageName: "aiohttp" },
|
|
1774
|
-
];
|
|
1775
|
-
if (source === "url") {
|
|
1776
|
-
requiredModules.push({ module: "aiofiles", packageName: "aiofiles" });
|
|
1777
|
-
}
|
|
1778
|
-
if (source === "huggingface") {
|
|
1779
|
-
requiredModules.push({ module: "datasets", packageName: "datasets" });
|
|
1780
|
-
requiredModules.push({ module: "PIL", packageName: "Pillow" });
|
|
1781
|
-
}
|
|
1782
|
-
if (source === "kaggle") {
|
|
1783
|
-
requiredModules.push({ module: "kaggle", packageName: "kaggle" });
|
|
1784
|
-
}
|
|
1785
|
-
try {
|
|
1786
|
-
await ensurePythonModules(requiredModules);
|
|
1787
|
-
}
|
|
1788
|
-
catch (error) {
|
|
1789
|
-
return {
|
|
1790
|
-
content: [{ type: "text", text: `ERROR: Python dependency setup failed: ${error.message}` }],
|
|
1791
|
-
isError: true,
|
|
1792
|
-
};
|
|
1793
|
-
}
|
|
1794
|
-
const scriptPath = path.join(dataRoot, "python", "asset_downloader_engine.py");
|
|
1795
|
-
const payload = {
|
|
1796
|
-
dataset_id: datasetId,
|
|
1797
|
-
source,
|
|
1798
|
-
repo_id: repoId,
|
|
1799
|
-
kaggle_ref: kaggleRef,
|
|
1800
|
-
urls,
|
|
1801
|
-
output_format: outputFormat,
|
|
1802
|
-
output_dir: requestedOutputDir,
|
|
1803
|
-
max_items: maxItems,
|
|
1804
|
-
workers,
|
|
1805
|
-
image_column: imageColumn,
|
|
1806
|
-
output_root: requestedOutputDir || process.cwd(),
|
|
1807
|
-
recipes_dir: path.join(dataRoot, "recipes"),
|
|
1808
|
-
};
|
|
1809
|
-
try {
|
|
1810
|
-
const result = await runPythonJson(scriptPath, ["download", JSON.stringify(payload)]);
|
|
1811
|
-
if (!result?.ok) {
|
|
1812
|
-
const errMsg = result?.error || "Unknown error";
|
|
1813
|
-
// Enhance error messages for common failures
|
|
1814
|
-
let hint = "";
|
|
1815
|
-
if (errMsg.includes("No image column")) {
|
|
1816
|
-
hint = "\n\nHint: Specify image_column parameter with the column name containing images/URLs.";
|
|
1817
|
-
}
|
|
1818
|
-
else if (errMsg.includes("Authentication") || errMsg.includes("401") || errMsg.includes("403")) {
|
|
1819
|
-
hint = "\n\nHint: Use configure_keys tool to set HF_TOKEN for gated/private datasets.";
|
|
1820
|
-
}
|
|
1821
|
-
return {
|
|
1822
|
-
content: [{ type: "text", text: `ERROR: asset download failed: ${errMsg}${hint}` }],
|
|
1823
|
-
isError: true,
|
|
1824
|
-
};
|
|
1825
|
-
}
|
|
1826
|
-
return {
|
|
1827
|
-
content: [{ type: "text", text: JSON.stringify(result.result, null, 2) }],
|
|
1828
|
-
};
|
|
1829
|
-
}
|
|
1830
|
-
catch (error) {
|
|
1831
|
-
return {
|
|
1832
|
-
content: [{ type: "text", text: `ERROR: asset downloader execution failed: ${error.message}` }],
|
|
1833
|
-
isError: true,
|
|
1834
|
-
};
|
|
1835
|
-
}
|
|
1836
|
-
}
|
|
1837
|
-
case "configure_kaggle": {
|
|
1838
|
-
const username = String(request.params.arguments?.username || "").trim();
|
|
1839
|
-
const key = String(request.params.arguments?.key || "").trim();
|
|
1840
|
-
if (!username || !key) {
|
|
1841
|
-
throw new McpError(ErrorCode.InvalidParams, "username and key are required");
|
|
1842
|
-
}
|
|
1843
|
-
const r1 = secureKeys.set("kaggle_username", username);
|
|
1844
|
-
const r2 = secureKeys.set("kaggle_key", key);
|
|
1845
|
-
process.env.KAGGLE_USERNAME = username;
|
|
1846
|
-
process.env.KAGGLE_KEY = key;
|
|
1847
|
-
return {
|
|
1848
|
-
content: [{ type: "text", text: `Kaggle credentials saved securely (${r1.method || "store"}/${r2.method || "store"}). You can now use Kaggle datasets.` }]
|
|
1849
|
-
};
|
|
1850
|
-
}
|
|
1851
|
-
case "configure_keys": {
|
|
1852
|
-
const hfToken = String(request.params.arguments?.hf_token || "").trim();
|
|
1853
|
-
const kaggleUsername = String(request.params.arguments?.kaggle_username || "").trim();
|
|
1854
|
-
const kaggleKey = String(request.params.arguments?.kaggle_key || "").trim();
|
|
1855
|
-
const dataworldToken = String(request.params.arguments?.dataworld_token || "").trim();
|
|
1856
|
-
const saved = [];
|
|
1857
|
-
const methods = [];
|
|
1858
|
-
if (hfToken) {
|
|
1859
|
-
const r = secureKeys.set("hf_token", hfToken);
|
|
1860
|
-
if (r.ok) {
|
|
1861
|
-
process.env.HF_TOKEN = hfToken;
|
|
1862
|
-
saved.push("HF token");
|
|
1863
|
-
if (r.method)
|
|
1864
|
-
methods.push(r.method);
|
|
1865
|
-
}
|
|
1866
|
-
}
|
|
1867
|
-
if (kaggleUsername) {
|
|
1868
|
-
const r = secureKeys.set("kaggle_username", kaggleUsername);
|
|
1869
|
-
if (r.ok) {
|
|
1870
|
-
process.env.KAGGLE_USERNAME = kaggleUsername;
|
|
1871
|
-
saved.push("Kaggle username");
|
|
1872
|
-
if (r.method)
|
|
1873
|
-
methods.push(r.method);
|
|
1874
|
-
}
|
|
1875
|
-
}
|
|
1876
|
-
if (kaggleKey) {
|
|
1877
|
-
const r = secureKeys.set("kaggle_key", kaggleKey);
|
|
1878
|
-
if (r.ok) {
|
|
1879
|
-
process.env.KAGGLE_KEY = kaggleKey;
|
|
1880
|
-
saved.push("Kaggle key");
|
|
1881
|
-
if (r.method)
|
|
1882
|
-
methods.push(r.method);
|
|
1883
|
-
}
|
|
1884
|
-
}
|
|
1885
|
-
if (dataworldToken) {
|
|
1886
|
-
const r = secureKeys.set("dataworld_token", dataworldToken);
|
|
1887
|
-
if (r.ok) {
|
|
1888
|
-
process.env.DW_AUTH_TOKEN = dataworldToken;
|
|
1889
|
-
saved.push("data.world token");
|
|
1890
|
-
if (r.method)
|
|
1891
|
-
methods.push(r.method);
|
|
1892
|
-
}
|
|
1893
|
-
}
|
|
1894
|
-
if (saved.length === 0) {
|
|
1895
|
-
return {
|
|
1896
|
-
content: [{ type: "text", text: "No keys provided. Core Vesper tools continue to work without API keys." }]
|
|
1897
|
-
};
|
|
1898
|
-
}
|
|
1899
|
-
return {
|
|
1900
|
-
content: [{ type: "text", text: `Key saved securely. Updated: ${saved.join(", ")}.` }]
|
|
1901
|
-
};
|
|
1902
|
-
}
|
|
1903
|
-
case "get_dataset_info": {
|
|
1904
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1905
|
-
if (!datasetId) {
|
|
1906
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
1907
|
-
}
|
|
1908
|
-
const dataset = metadataStore.getDataset(datasetId);
|
|
1909
|
-
if (!dataset) {
|
|
1910
|
-
// Fallback: check the registry for local path info
|
|
1911
|
-
const regEntry = getRegistryEntry(datasetId);
|
|
1912
|
-
const regPath = regEntry?.local_path || regEntry?.path;
|
|
1913
|
-
if (regEntry) {
|
|
1914
|
-
const exists = regPath && fs.existsSync(regPath);
|
|
1915
|
-
return {
|
|
1916
|
-
content: [{ type: "text", text: `**${datasetId}** (from registry)\n- Local path: ${regPath || "unknown"}\n- Status: ${regEntry.status || "unknown"}${exists ? "" : " (file missing)"}\n\nNote: Full metadata not available in metadata store. Use prepare_dataset to get full details.` }],
|
|
1917
|
-
};
|
|
1918
|
-
}
|
|
1919
|
-
return {
|
|
1920
|
-
content: [{ type: "text", text: `ERROR: Dataset not found: ${datasetId}. Use vesper_list_datasets to see available datasets, or prepare_dataset to add new ones.` }],
|
|
1921
|
-
isError: true,
|
|
1922
|
-
};
|
|
1923
|
-
}
|
|
1924
|
-
// Enrich: if total_examples is 0-ish, try the HF datasets-server /size API
|
|
1925
|
-
if ((!dataset.total_examples || dataset.total_examples === 0) && dataset.source === "huggingface") {
|
|
1926
|
-
try {
|
|
1927
|
-
const sizeResp = await fetch(`https://datasets-server.huggingface.co/size?dataset=${encodeURIComponent(dataset.id)}`);
|
|
1928
|
-
if (sizeResp.ok) {
|
|
1929
|
-
const sizeData = await sizeResp.json();
|
|
1930
|
-
const numRows = sizeData?.size?.dataset?.num_rows;
|
|
1931
|
-
if (numRows && numRows > 0) {
|
|
1932
|
-
dataset.total_examples = numRows;
|
|
1933
|
-
// Also backfill splits
|
|
1934
|
-
if (sizeData?.size?.splits && Array.isArray(sizeData.size.splits)) {
|
|
1935
|
-
dataset.splits = sizeData.size.splits.map((s) => ({
|
|
1936
|
-
name: s.split,
|
|
1937
|
-
num_examples: s.num_rows || 0,
|
|
1938
|
-
size_bytes: s.num_bytes_parquet_files || 0,
|
|
1939
|
-
}));
|
|
1940
|
-
dataset.has_train_split = dataset.splits.some((s) => s.name === "train");
|
|
1941
|
-
dataset.has_test_split = dataset.splits.some((s) => s.name === "test");
|
|
1942
|
-
dataset.has_validation_split = dataset.splits.some((s) => s.name === "validation" || s.name === "val");
|
|
1943
|
-
}
|
|
1944
|
-
// Persist enriched metadata
|
|
1945
|
-
metadataStore.saveDataset(dataset);
|
|
1946
|
-
}
|
|
1947
|
-
}
|
|
1948
|
-
}
|
|
1949
|
-
catch {
|
|
1950
|
-
// Enrichment is best-effort; continue with whatever we have
|
|
1951
|
-
}
|
|
1952
|
-
}
|
|
1953
|
-
const formattedOutput = formatDatasetInfo(dataset);
|
|
1954
|
-
return { content: [{ type: "text", text: formattedOutput }] };
|
|
1955
|
-
}
|
|
1956
|
-
case "analyze_quality": {
|
|
1957
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1958
|
-
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
1959
|
-
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
1960
|
-
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
1961
|
-
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
1962
|
-
// Demo Fallback for easy testing
|
|
1963
|
-
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
1964
|
-
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
1965
|
-
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
1966
|
-
if (fs.existsSync(demoParquetPath)) {
|
|
1967
|
-
filePath = demoParquetPath;
|
|
1968
|
-
}
|
|
1969
|
-
else if (fs.existsSync(demoCsvPath)) {
|
|
1970
|
-
filePath = demoCsvPath;
|
|
1971
|
-
}
|
|
1972
|
-
else if (datasetId !== "demo") {
|
|
1973
|
-
return {
|
|
1974
|
-
content: [{ type: "text", text: `ERROR: Local data file for ${datasetId} not found. Try running prepare_dataset first, or use 'demo' as the dataset_id.` }],
|
|
1975
|
-
isError: true
|
|
1976
|
-
};
|
|
1977
|
-
}
|
|
1978
|
-
}
|
|
1979
|
-
const report = await qualityAnalyzer.analyze(filePath);
|
|
1980
|
-
return {
|
|
1981
|
-
content: [{ type: "text", text: JSON.stringify(report, null, 2) }]
|
|
1982
|
-
};
|
|
1983
|
-
}
|
|
1984
|
-
case "preview_cleaning": {
|
|
1985
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
1986
|
-
const safeId = toSafeDatasetPathFragment(datasetId);
|
|
1987
|
-
const parquetPath = path.join(dataRoot, "data", "raw", `${safeId}.parquet`);
|
|
1988
|
-
const csvPath = path.join(dataRoot, "data", "raw", `${safeId}.csv`);
|
|
1989
|
-
let filePath = fs.existsSync(parquetPath) ? parquetPath : csvPath;
|
|
1990
|
-
if (datasetId === "demo" || !fs.existsSync(filePath)) {
|
|
1991
|
-
const demoParquetPath = path.join(dataRoot, "e2e_demo_output", "raw_data.parquet");
|
|
1992
|
-
const demoCsvPath = path.join(dataRoot, "e2e_demo_output", "raw_data.csv");
|
|
1993
|
-
if (fs.existsSync(demoParquetPath)) {
|
|
1994
|
-
filePath = demoParquetPath;
|
|
1995
|
-
}
|
|
1996
|
-
else if (fs.existsSync(demoCsvPath)) {
|
|
1997
|
-
filePath = demoCsvPath;
|
|
1998
|
-
}
|
|
1999
|
-
else {
|
|
2000
|
-
throw new McpError(ErrorCode.InvalidParams, `Local data file not found for ${datasetId}. Please run prepare_dataset first.`);
|
|
2001
|
-
}
|
|
2002
|
-
}
|
|
2003
|
-
const report = await qualityAnalyzer.analyze(filePath);
|
|
2004
|
-
// Phase 1: Target Detection
|
|
2005
|
-
// We use the same TargetDetector instance inside CleaningPlanner now?
|
|
2006
|
-
// Actually, we instantiated it inside CleaningPlanner, so we just need to pass the file path
|
|
2007
|
-
// OR let the planner handle it if we update its signature to accept filePath.
|
|
2008
|
-
// Let's check `CleaningPlanner.generatePlan` signature again.
|
|
2009
|
-
// We updated it to accept `targetInfo`.
|
|
2010
|
-
// So we need to run detection HERE and pass it.
|
|
2011
|
-
// But `TargetDetector` is not exposed in `index.ts` scope yet.
|
|
2012
|
-
// Let's create a global instance or use the one inside planner if exposed (it's private).
|
|
2013
|
-
// Better approach: Instantiate TargetDetector here in index.ts for the tool content.
|
|
2014
|
-
// Quick fix: Instantiate local detector or make global.
|
|
2015
|
-
// I'll make a global `targetDetector` constant in index.ts
|
|
2016
|
-
// But wait, I updated `CleaningPlanner` to instantiate its own detector.
|
|
2017
|
-
// Does `CleaningPlanner` use it? No, I commented out the logic because it needed `filePath`.
|
|
2018
|
-
// RETRY STRATEGY:
|
|
2019
|
-
// 1. Instantiate `targetDetector` in `index.ts`.
|
|
2020
|
-
// 2. Run `detectTarget(filePath)`.
|
|
2021
|
-
// 3. Pass result to `cleaningPlanner.generatePlan(..., targetInfo)`.
|
|
2022
|
-
// I need to add `const targetDetector = new TargetDetector(__dirname);` to imports/init section first.
|
|
2023
|
-
// But since I'm in this tool, I can't look back.
|
|
2024
|
-
// I will assume I can add it, or just do it inside the case for now.
|
|
2025
|
-
// To do it properly, I should have added `targetDetector` to the global scope in previous step.
|
|
2026
|
-
// Let's do that in a separate step if needed.
|
|
2027
|
-
// For now, I'll instantiate it here.
|
|
2028
|
-
const { TargetDetector } = await import("./preparation/target-detector.js");
|
|
2029
|
-
const detector = new TargetDetector(__dirname);
|
|
2030
|
-
const targetResult = await detector.detectTarget(filePath);
|
|
2031
|
-
const targetInfo = targetResult.target_column ? {
|
|
2032
|
-
target: targetResult.target_column,
|
|
2033
|
-
confidence: targetResult.confidence
|
|
2034
|
-
} : undefined;
|
|
2035
|
-
const plan = await cleaningPlanner.generatePlan(datasetId, report, undefined, targetInfo);
|
|
2036
|
-
let explanation = `### Cleaning Plan for ${datasetId}\n\n`;
|
|
2037
|
-
if (targetInfo && targetInfo.target !== "target" && targetInfo.confidence > 0.7) {
|
|
2038
|
-
explanation += `**Target Detected**: '${targetInfo.target}' (Confidence: ${targetInfo.confidence.toFixed(2)})\n`;
|
|
2039
|
-
explanation += ` - **Action**: Renaming to 'target' for consistency.\n\n`;
|
|
2040
|
-
}
|
|
2041
|
-
explanation += `Estimated Quality Improvement: +${plan.estimated_impact.quality_score_improvement} points\n\n`;
|
|
2042
|
-
if (plan.operations.length === 0) {
|
|
2043
|
-
explanation += "No cleaning operations required.";
|
|
2044
|
-
}
|
|
2045
|
-
else {
|
|
2046
|
-
plan.operations.forEach((op, i) => {
|
|
2047
|
-
explanation += `${i + 1}. **${op.type}**: ${op.reason}\n`;
|
|
2048
|
-
});
|
|
2049
|
-
}
|
|
2050
|
-
return {
|
|
2051
|
-
content: [{ type: "text", text: explanation }]
|
|
2052
|
-
};
|
|
2053
|
-
}
|
|
2054
|
-
case "custom_clean": {
|
|
2055
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2056
|
-
const ops = request.params.arguments?.operations;
|
|
2057
|
-
if (!datasetId || datasetId === "undefined") {
|
|
2058
|
-
throw new McpError(ErrorCode.InvalidParams, "dataset_id is required");
|
|
2059
|
-
}
|
|
2060
|
-
if (!ops || !Array.isArray(ops) || ops.length === 0) {
|
|
2061
|
-
throw new McpError(ErrorCode.InvalidParams, "operations array is required and must not be empty. Supported operations: RemoveDuplicates, DropColumns, FillMissing, FixTypes, RemoveOutliers, EncodeCategories");
|
|
2062
|
-
}
|
|
2063
|
-
// Pre-check: verify dataset file exists before starting the job
|
|
2064
|
-
const cleanRegEntry = getRegistryEntry(datasetId);
|
|
2065
|
-
const cleanRegPath = cleanRegEntry?.local_path || cleanRegEntry?.path;
|
|
2066
|
-
const cleanDlStatus = metadataStore.getDownloadStatus(datasetId);
|
|
2067
|
-
const cleanSafeId = toSafeDatasetPathFragment(datasetId);
|
|
2068
|
-
const cleanDataExists = (cleanRegPath && fs.existsSync(cleanRegPath)) ||
|
|
2069
|
-
(cleanDlStatus?.local_path && fs.existsSync(cleanDlStatus.local_path)) ||
|
|
2070
|
-
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.parquet`)) ||
|
|
2071
|
-
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.csv`)) ||
|
|
2072
|
-
fs.existsSync(path.join(dataRoot, "data", "raw", `${cleanSafeId}.feather`)) ||
|
|
2073
|
-
fs.existsSync(datasetId);
|
|
2074
|
-
if (!cleanDataExists) {
|
|
2075
|
-
return {
|
|
2076
|
-
content: [{ type: "text", text: `Dataset '${datasetId}' not found locally. Download it first using download_dataset or prepare_dataset, then run custom_clean.` }],
|
|
2077
|
-
isError: true,
|
|
2078
|
-
};
|
|
2079
|
-
}
|
|
2080
|
-
const job = jobManager.createJob("clean", 0, { datasetId, ops });
|
|
2081
|
-
return {
|
|
2082
|
-
content: [{ type: "text", text: `Cleaning job started. ID: ${job.id}. Use check_job_status to monitor progress.` }]
|
|
2083
|
-
};
|
|
2084
|
-
}
|
|
2085
|
-
case "prepare_dataset": {
|
|
2086
|
-
hydrateExternalKeys();
|
|
2087
|
-
const query = String(request.params.arguments?.query);
|
|
2088
|
-
const requirements = request.params.arguments?.requirements ? String(request.params.arguments?.requirements) : undefined;
|
|
2089
|
-
const downloadImages = request.params.arguments?.download_images === true;
|
|
2090
|
-
const requestedOutputDir = request.params.arguments?.target_dir
|
|
2091
|
-
? String(request.params.arguments.target_dir).trim()
|
|
2092
|
-
: request.params.arguments?.output_dir
|
|
2093
|
-
? String(request.params.arguments.output_dir).trim()
|
|
2094
|
-
: "";
|
|
2095
|
-
const outputDir = requestedOutputDir || process.cwd();
|
|
2096
|
-
if (!query || query === "undefined") {
|
|
2097
|
-
throw new McpError(ErrorCode.InvalidParams, "query is required - describe the dataset you need or provide a dataset ID");
|
|
2098
|
-
}
|
|
2099
|
-
const job = jobManager.createJob("prepare", 0, { query, requirements, downloadImages, outputDir });
|
|
2100
|
-
return {
|
|
2101
|
-
content: [{ type: "text", text: `Preparation job started. ID: ${job.id}. Vesper is finding and preparing the best dataset for you. Use check_job_status to monitor progress.` }]
|
|
2102
|
-
};
|
|
2103
|
-
}
|
|
2104
|
-
case "compare_datasets": {
|
|
2105
|
-
const datasetIds = request.params.arguments?.dataset_ids;
|
|
2106
|
-
const datasets = datasetIds.map(id => metadataStore.getDataset(id)).filter(d => !!d);
|
|
2107
|
-
let comparison = "| Metric | " + datasets.map(d => d.name).join(" | ") + " |\n";
|
|
2108
|
-
comparison += "| :--- | " + datasets.map(() => " :---: ").join("|") + " |\n";
|
|
2109
|
-
comparison += "| Quality Score | " + datasets.map(d => `${d.quality_score}/100`).join(" | ") + " |\n";
|
|
2110
|
-
comparison += "| License | " + datasets.map(d => d.license.category).join(" | ") + " |\n";
|
|
2111
|
-
comparison += "| Downloads | " + datasets.map(d => d.downloads).join(" | ") + " |\n";
|
|
2112
|
-
comparison += "| Domain | " + datasets.map(d => d.domain).join(" | ") + " |\n";
|
|
2113
|
-
return {
|
|
2114
|
-
content: [{ type: "text", text: comparison }]
|
|
2115
|
-
};
|
|
2116
|
-
}
|
|
2117
|
-
case "check_job_status": {
|
|
2118
|
-
const jobId = String(request.params.arguments?.job_id);
|
|
2119
|
-
const job = metadataStore.getJob(jobId);
|
|
2120
|
-
if (!job) {
|
|
2121
|
-
throw new McpError(ErrorCode.InvalidParams, `Job not found: ${jobId}`);
|
|
2122
|
-
}
|
|
2123
|
-
const activeStatuses = new Set(["pending", "queued", "running", "retrying"]);
|
|
2124
|
-
const now = Date.now();
|
|
2125
|
-
const last = jobStatusLastPoll[jobId] || 0;
|
|
2126
|
-
const minPollMs = 3000;
|
|
2127
|
-
if (activeStatuses.has(job.status) && (now - last) < minPollMs) {
|
|
2128
|
-
const waitMs = minPollMs - (now - last);
|
|
2129
|
-
return {
|
|
2130
|
-
content: [{ type: "text", text: `No significant status change yet. Please poll again in ~${Math.ceil(waitMs / 1000)}s.` }]
|
|
2131
|
-
};
|
|
2132
|
-
}
|
|
2133
|
-
jobStatusLastPoll[jobId] = now;
|
|
2134
|
-
return {
|
|
2135
|
-
content: [{ type: "text", text: formatJobStatus(job) }]
|
|
2136
|
-
};
|
|
2137
|
-
}
|
|
2138
|
-
case "export_dataset": {
|
|
2139
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2140
|
-
const requestedTargetDir = request.params.arguments?.target_dir
|
|
2141
|
-
? String(request.params.arguments?.target_dir).trim()
|
|
2142
|
-
: request.params.arguments?.output_dir
|
|
2143
|
-
? String(request.params.arguments?.output_dir).trim()
|
|
2144
|
-
: "";
|
|
2145
|
-
const targetDir = requestedTargetDir || process.cwd();
|
|
2146
|
-
const requestedFormat = String(request.params.arguments?.format || "feather");
|
|
2147
|
-
const fastMode = request.params.arguments?.fast === true;
|
|
2148
|
-
const preview = request.params.arguments?.preview === true;
|
|
2149
|
-
const sampleRows = request.params.arguments?.sample_rows ? Number(request.params.arguments.sample_rows) : undefined;
|
|
2150
|
-
const columns = request.params.arguments?.columns;
|
|
2151
|
-
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
2152
|
-
// Use Metadata or Registry to find the actual local file
|
|
2153
|
-
let sourcePath = resolveDatasetLocalPath(datasetId);
|
|
2154
|
-
if (!sourcePath) {
|
|
2155
|
-
console.error(`[Export] No local data found for ${datasetId}. Attempting to prepare automatically...`);
|
|
2156
|
-
// Start a prepare job for this dataset id (acts like calling prepare_dataset)
|
|
2157
|
-
try {
|
|
2158
|
-
jobManager.createJob("prepare", 0, { query: datasetId, requirements: undefined, downloadImages: false, outputDir: process.cwd() });
|
|
2159
|
-
}
|
|
2160
|
-
catch (e) {
|
|
2161
|
-
console.error(`[Export] Failed to start prepare job for ${datasetId}: ${e?.message || e}`);
|
|
2162
|
-
}
|
|
2163
|
-
// Poll for download status or registry entry until local_path appears or timeout
|
|
2164
|
-
const wait = (ms) => new Promise(res => setTimeout(res, ms));
|
|
2165
|
-
const maxWait = 120_000; // 120s
|
|
2166
|
-
const interval = 2000;
|
|
2167
|
-
let waited = 0;
|
|
2168
|
-
while (waited < maxWait) {
|
|
2169
|
-
const resolved = resolveDatasetLocalPath(datasetId);
|
|
2170
|
-
if (resolved) {
|
|
2171
|
-
sourcePath = resolved;
|
|
2172
|
-
console.error(`[Export] Local data is now available for ${datasetId}: ${sourcePath}`);
|
|
2173
|
-
break;
|
|
2174
|
-
}
|
|
2175
|
-
await wait(interval);
|
|
2176
|
-
waited += interval;
|
|
2177
|
-
}
|
|
2178
|
-
// If still no sourcePath, return helpful error listing prepared datasets
|
|
2179
|
-
if (!sourcePath) {
|
|
2180
|
-
const entries = readRegistry();
|
|
2181
|
-
const listText = entries.length === 0 ? "(no prepared datasets found)" : entries.map(e => `- ${e.dataset_id || e.id || "unknown"}: ${e.local_path || e.path || "unknown"}`).slice(0, 10).join("\n") + (entries.length > 10 ? "\n...and " + (entries.length - 10) + " more" : "");
|
|
2182
|
-
return {
|
|
2183
|
-
content: [{ type: "text", text: `ERROR: No local data found for ${datasetId} after attempting prepare. Check credentials and try running prepare_dataset manually. Prepared datasets:\n${listText}` }],
|
|
2184
|
-
isError: true
|
|
2185
|
-
};
|
|
2186
|
-
}
|
|
2187
|
-
}
|
|
2188
|
-
sourcePath = ensureExportableLocalPath(sourcePath);
|
|
2189
|
-
try {
|
|
2190
|
-
upsertRegistry(datasetId, sourcePath, "completed");
|
|
2191
|
-
}
|
|
2192
|
-
catch (e) {
|
|
2193
|
-
console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
|
|
2194
|
-
}
|
|
2195
|
-
// If NOT fast mode, run quality/cleaning pipeline first (only for csv/parquet compat)
|
|
2196
|
-
if (!fastMode) {
|
|
2197
|
-
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
2198
|
-
const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
|
|
2199
|
-
const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
|
|
2200
|
-
if (!pipelineCompatibleInput) {
|
|
2201
|
-
console.error(`[Export] Skipping pipeline for ${currentExt} input; using direct exporter conversion.`);
|
|
2202
|
-
}
|
|
2203
|
-
else if (currentExt !== pipelineFmt) {
|
|
2204
|
-
console.error(`[Export] Running quality/cleaning pipeline (use fast=true to skip)...`);
|
|
2205
|
-
try {
|
|
2206
|
-
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
2207
|
-
if (pipelineResult.final_output_path) {
|
|
2208
|
-
sourcePath = pipelineResult.final_output_path;
|
|
2209
|
-
try {
|
|
2210
|
-
// Update registry to point to pipeline's final output
|
|
2211
|
-
upsertRegistry(datasetId, sourcePath, "completed");
|
|
2212
|
-
}
|
|
2213
|
-
catch (e) {
|
|
2214
|
-
console.error(`[Registry] Failed to update registry for ${datasetId}: ${e?.message || e}`);
|
|
2215
|
-
}
|
|
2216
|
-
}
|
|
2217
|
-
}
|
|
2218
|
-
catch (err) {
|
|
2219
|
-
console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
|
|
2220
|
-
}
|
|
2221
|
-
}
|
|
2222
|
-
}
|
|
2223
|
-
else {
|
|
2224
|
-
console.error(`[Export] Fast mode - skipping quality analysis and cleaning`);
|
|
2225
|
-
}
|
|
2226
|
-
// Build export options
|
|
2227
|
-
const exportOpts = {};
|
|
2228
|
-
if (compression)
|
|
2229
|
-
exportOpts.compression = compression;
|
|
2230
|
-
if (preview)
|
|
2231
|
-
exportOpts.preview = true;
|
|
2232
|
-
if (sampleRows)
|
|
2233
|
-
exportOpts.sample_rows = sampleRows;
|
|
2234
|
-
if (columns)
|
|
2235
|
-
exportOpts.columns = columns;
|
|
2236
|
-
try {
|
|
2237
|
-
// Determine output file name
|
|
2238
|
-
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
2239
|
-
const ext = extMap[requestedFormat] || ".feather";
|
|
2240
|
-
const safeName = toSafeDatasetPathFragment(datasetId);
|
|
2241
|
-
const outDir = targetDir;
|
|
2242
|
-
if (!fs.existsSync(outDir))
|
|
2243
|
-
fs.mkdirSync(outDir, { recursive: true });
|
|
2244
|
-
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
2245
|
-
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
2246
|
-
// Build rich response
|
|
2247
|
-
let msg = `**Export complete**\n`;
|
|
2248
|
-
msg += `- **File**: ${result.output_path}\n`;
|
|
2249
|
-
msg += `- **Format**: ${result.format}${result.compression ? ` (${result.compression})` : ""}\n`;
|
|
2250
|
-
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
2251
|
-
if (result.file_size_mb !== undefined)
|
|
2252
|
-
msg += `- **Size**: ${result.file_size_mb} MB\n`;
|
|
2253
|
-
if (result.elapsed_seconds !== undefined)
|
|
2254
|
-
msg += `- **Time**: ${result.elapsed_seconds}s\n`;
|
|
2255
|
-
if (result.preview_path)
|
|
2256
|
-
msg += `- **Preview**: ${result.preview_path}\n`;
|
|
2257
|
-
msg += `\n`;
|
|
2258
|
-
if (requestedFormat === "feather") {
|
|
2259
|
-
msg += `**Inspect with:**\n`;
|
|
2260
|
-
msg += ` Python: \`pd.read_feather('${result.output_path}').head()\`\n`;
|
|
2261
|
-
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
2262
|
-
}
|
|
2263
|
-
else if (requestedFormat === "parquet") {
|
|
2264
|
-
msg += `**Inspect with:**\n`;
|
|
2265
|
-
msg += ` Python: \`pd.read_parquet('${result.output_path}').head()\`\n`;
|
|
2266
|
-
msg += ` DuckDB: \`SELECT * FROM '${result.output_path}' LIMIT 50;\`\n`;
|
|
2267
|
-
}
|
|
2268
|
-
return { content: [{ type: "text", text: msg }] };
|
|
2269
|
-
}
|
|
2270
|
-
catch (error) {
|
|
2271
|
-
return {
|
|
2272
|
-
content: [{ type: "text", text: `ERROR: Export failed: ${error.message}` }],
|
|
2273
|
-
isError: true
|
|
2274
|
-
};
|
|
2275
|
-
}
|
|
2276
|
-
}
|
|
2277
|
-
case "vesper_list_datasets": {
|
|
2278
|
-
const entries = readRegistry();
|
|
2279
|
-
if (entries.length === 0) {
|
|
2280
|
-
return {
|
|
2281
|
-
content: [{ type: "text", text: "No prepared datasets found in the Vesper registry. Use prepare_dataset or download_dataset to add datasets." }]
|
|
2282
|
-
};
|
|
2283
|
-
}
|
|
2284
|
-
const lines = entries.map((e, i) => {
|
|
2285
|
-
const id = e.dataset_id || e.id || "unknown";
|
|
2286
|
-
const localPath = e.local_path || e.path || "unknown";
|
|
2287
|
-
const exists = typeof localPath === "string" && localPath !== "unknown" && fs.existsSync(localPath);
|
|
2288
|
-
return `${i + 1}. **${id}**\n Path: ${localPath}\n Status: ${e.status || "unknown"}${exists ? "" : " (file missing)"}`;
|
|
2289
|
-
});
|
|
2290
|
-
return {
|
|
2291
|
-
content: [{ type: "text", text: `**Vesper Registry** (${entries.length} dataset${entries.length !== 1 ? "s" : ""}):\n\n${lines.join("\n\n")}` }]
|
|
2292
|
-
};
|
|
2293
|
-
}
|
|
2294
|
-
case "vesper_convert_format": {
|
|
2295
|
-
const filePath = String(request.params.arguments?.file_path || "").trim();
|
|
2296
|
-
const targetFormat = String(request.params.arguments?.target_format || "").trim().toLowerCase();
|
|
2297
|
-
if (!filePath) {
|
|
2298
|
-
throw new McpError(ErrorCode.InvalidParams, "file_path is required");
|
|
2299
|
-
}
|
|
2300
|
-
if (!["csv", "parquet", "json", "jsonl"].includes(targetFormat)) {
|
|
2301
|
-
throw new McpError(ErrorCode.InvalidParams, "target_format must be one of: csv, parquet, json, jsonl");
|
|
2302
|
-
}
|
|
2303
|
-
if (!fs.existsSync(filePath)) {
|
|
2304
|
-
return {
|
|
2305
|
-
content: [{ type: "text", text: `ERROR: File not found: ${filePath}` }],
|
|
2306
|
-
isError: true,
|
|
2307
|
-
};
|
|
2308
|
-
}
|
|
2309
|
-
const inputExt = path.extname(filePath).toLowerCase();
|
|
2310
|
-
const extMap = { csv: ".csv", parquet: ".parquet", json: ".json", jsonl: ".jsonl" };
|
|
2311
|
-
const outputExt = extMap[targetFormat];
|
|
2312
|
-
if (inputExt === outputExt) {
|
|
2313
|
-
return {
|
|
2314
|
-
content: [{ type: "text", text: `File is already in ${targetFormat} format: ${filePath}` }],
|
|
2315
|
-
};
|
|
2316
|
-
}
|
|
2317
|
-
const outputPath = filePath.replace(/\.[^.]+$/, outputExt);
|
|
2318
|
-
try {
|
|
2319
|
-
await ensurePythonModules([{ module: "polars", packageName: "polars" }]);
|
|
2320
|
-
const convertScript = path.join(dataRoot, "python", "convert_engine.py");
|
|
2321
|
-
const result = await runPythonJson(convertScript, [filePath, outputPath]);
|
|
2322
|
-
if (!result.ok) {
|
|
2323
|
-
return {
|
|
2324
|
-
content: [{ type: "text", text: `ERROR: Conversion failed: ${result.error}` }],
|
|
2325
|
-
isError: true,
|
|
2326
|
-
};
|
|
2327
|
-
}
|
|
2328
|
-
// Register converted file in the registry
|
|
2329
|
-
const datasetId = path.basename(outputPath, outputExt);
|
|
2330
|
-
try {
|
|
2331
|
-
upsertRegistry(datasetId, outputPath, "completed");
|
|
2332
|
-
}
|
|
2333
|
-
catch (e) {
|
|
2334
|
-
console.error(`[Convert] Registry write failed: ${e?.message || e}`);
|
|
2335
|
-
}
|
|
2336
|
-
let msg = `**Conversion complete**\n`;
|
|
2337
|
-
msg += `- **Input**: ${filePath} (${inputExt.slice(1)})\n`;
|
|
2338
|
-
msg += `- **Output**: ${result.output_path} (${targetFormat})\n`;
|
|
2339
|
-
msg += `- **Rows**: ${result.rows?.toLocaleString()}${result.columns ? " × " + result.columns + " cols" : ""}\n`;
|
|
2340
|
-
if (result.size_mb !== undefined)
|
|
2341
|
-
msg += `- **Size**: ${result.size_mb} MB\n`;
|
|
2342
|
-
return { content: [{ type: "text", text: msg }] };
|
|
2343
|
-
}
|
|
2344
|
-
catch (error) {
|
|
2345
|
-
return {
|
|
2346
|
-
content: [{ type: "text", text: `ERROR: Conversion failed: ${error.message}` }],
|
|
2347
|
-
isError: true,
|
|
2348
|
-
};
|
|
2349
|
-
}
|
|
2350
|
-
}
|
|
2351
|
-
case "fuse_datasets": {
|
|
2352
|
-
const rawSources = request.params.arguments?.sources;
|
|
2353
|
-
if (!rawSources || !Array.isArray(rawSources) || rawSources.length < 2) {
|
|
2354
|
-
throw new McpError(ErrorCode.InvalidParams, "sources must contain at least 2 dataset IDs/paths");
|
|
2355
|
-
}
|
|
2356
|
-
const strategy = request.params.arguments?.strategy || "concat";
|
|
2357
|
-
const joinOn = request.params.arguments?.join_on;
|
|
2358
|
-
const how = request.params.arguments?.how || "inner";
|
|
2359
|
-
const dedup = request.params.arguments?.dedup !== false;
|
|
2360
|
-
const runQualityAfter = request.params.arguments?.run_quality_after !== false;
|
|
2361
|
-
const leakageCheck = request.params.arguments?.leakage_check !== false;
|
|
2362
|
-
const outputFormat = request.params.arguments?.output_format || "feather";
|
|
2363
|
-
const compression = request.params.arguments?.compression ? String(request.params.arguments.compression) : undefined;
|
|
2364
|
-
const preview = request.params.arguments?.preview !== false;
|
|
2365
|
-
const resolvedPaths = [];
|
|
2366
|
-
const unresolved = [];
|
|
2367
|
-
for (const src of rawSources) {
|
|
2368
|
-
if (fs.existsSync(src)) {
|
|
2369
|
-
resolvedPaths.push(src);
|
|
2370
|
-
continue;
|
|
2371
|
-
}
|
|
2372
|
-
const status = metadataStore.getDownloadStatus(src);
|
|
2373
|
-
if (status?.local_path && fs.existsSync(status.local_path)) {
|
|
2374
|
-
resolvedPaths.push(status.local_path);
|
|
2375
|
-
continue;
|
|
2376
|
-
}
|
|
2377
|
-
unresolved.push(src);
|
|
2378
|
-
}
|
|
2379
|
-
if (unresolved.length > 0) {
|
|
2380
|
-
return {
|
|
2381
|
-
content: [{
|
|
2382
|
-
type: "text",
|
|
2383
|
-
text: `ERROR: Could not resolve these sources to local files: ${unresolved.join(", ")}. Provide local paths or run prepare_dataset first.`
|
|
2384
|
-
}],
|
|
2385
|
-
isError: true
|
|
2386
|
-
};
|
|
2387
|
-
}
|
|
2388
|
-
try {
|
|
2389
|
-
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow" };
|
|
2390
|
-
const ext = extMap[outputFormat] || ".feather";
|
|
2391
|
-
const outDir = process.cwd();
|
|
2392
|
-
if (!fs.existsSync(outDir))
|
|
2393
|
-
fs.mkdirSync(outDir, { recursive: true });
|
|
2394
|
-
const outputPath = path.join(outDir, `fused_${Date.now()}${ext}`);
|
|
2395
|
-
console.error(`[Fusion] Resolved output directory: ${outDir}`);
|
|
2396
|
-
const result = await fusionEngine.fuse(resolvedPaths, outputPath, {
|
|
2397
|
-
strategy,
|
|
2398
|
-
join_on: joinOn,
|
|
2399
|
-
how,
|
|
2400
|
-
dedup,
|
|
2401
|
-
run_quality_after: runQualityAfter,
|
|
2402
|
-
leakage_check: leakageCheck,
|
|
2403
|
-
output_format: outputFormat,
|
|
2404
|
-
compression: compression,
|
|
2405
|
-
preview,
|
|
2406
|
-
});
|
|
2407
|
-
const nullDelta = result.stats.null_delta;
|
|
2408
|
-
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
2409
|
-
// Register fused dataset under a generated id so users can export it easily
|
|
2410
|
-
const fusedId = `fused_${Date.now()}`;
|
|
2411
|
-
try {
|
|
2412
|
-
upsertRegistry(fusedId, result.output_path, "completed");
|
|
2413
|
-
}
|
|
2414
|
-
catch (e) {
|
|
2415
|
-
console.error(`[Registry] Failed to register fused dataset ${fusedId}: ${e?.message || e}`);
|
|
2416
|
-
}
|
|
2417
|
-
let msg = `Fused ${result.stats.sources_count} sources -> ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).\n`;
|
|
2418
|
-
msg += `- Duplicates removed: ${result.stats.duplicates_removed.toLocaleString()}\n`;
|
|
2419
|
-
msg += `- Null change: ${nullText}\n`;
|
|
2420
|
-
msg += `- Output: ${result.output_path}\n`;
|
|
2421
|
-
if (result.preview_path)
|
|
2422
|
-
msg += `- Preview: ${result.preview_path}\n`;
|
|
2423
|
-
if (result.leakage_report) {
|
|
2424
|
-
msg += `- Leakage: ${result.leakage_report.leakage_detected ? "detected" : "none"}`;
|
|
2425
|
-
if (result.leakage_report.leakage_count) {
|
|
2426
|
-
msg += ` (${result.leakage_report.leakage_count})`;
|
|
2427
|
-
}
|
|
2428
|
-
msg += "\n";
|
|
2429
|
-
}
|
|
2430
|
-
msg += `\nNext: run split_dataset/export_dataset on fused output. Registered fused dataset id: ${fusedId}`;
|
|
2431
|
-
return { content: [{ type: "text", text: msg }] };
|
|
2432
|
-
}
|
|
2433
|
-
catch (error) {
|
|
2434
|
-
return {
|
|
2435
|
-
content: [{ type: "text", text: `ERROR: Fusion failed: ${error.message}` }],
|
|
2436
|
-
isError: true
|
|
2437
|
-
};
|
|
2438
|
-
}
|
|
2439
|
-
}
|
|
2440
|
-
case "analyze_image_quality": {
|
|
2441
|
-
const inputPath = String(request.params.arguments?.path);
|
|
2442
|
-
if (!fs.existsSync(inputPath)) {
|
|
2443
|
-
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
2444
|
-
}
|
|
2445
|
-
try {
|
|
2446
|
-
const report = await imageAnalyzer.analyze(inputPath);
|
|
2447
|
-
let output = `## Image Quality Report\n\n`;
|
|
2448
|
-
output += `- **Total Images**: ${report.total_images}\n`;
|
|
2449
|
-
output += `- **Corrupted**: ${report.corrupted_count}\n`;
|
|
2450
|
-
output += `- **Average Resolution**: ${Math.round(report.average_width)}x${Math.round(report.average_height)}\n`;
|
|
2451
|
-
output += `- **Blurry Images**: ${report.blurry_count}\n\n`;
|
|
2452
|
-
if (report.individual_results.length > 0) {
|
|
2453
|
-
output += `### Sample Detail (Top 5)\n`;
|
|
2454
|
-
report.individual_results.slice(0, 5).forEach(img => {
|
|
2455
|
-
const statusLabel = img.status === "ok" ? (img.is_blurry ? "[warn]" : "[ok]") : "[error]";
|
|
2456
|
-
output += `${statusLabel} **${img.filename}**: ${img.width}x${img.height} | Mode: ${img.mode} | Blur: ${img.blur_score?.toFixed(1)}\n`;
|
|
2457
|
-
});
|
|
2458
|
-
}
|
|
2459
|
-
return {
|
|
2460
|
-
content: [{ type: "text", text: output }]
|
|
2461
|
-
};
|
|
2462
|
-
}
|
|
2463
|
-
catch (error) {
|
|
2464
|
-
return {
|
|
2465
|
-
content: [{ type: "text", text: `ERROR: Image analysis failed: ${error.message}` }],
|
|
2466
|
-
isError: true
|
|
2467
|
-
};
|
|
2468
|
-
}
|
|
2469
|
-
}
|
|
2470
|
-
case "analyze_media_quality": {
|
|
2471
|
-
const inputPath = String(request.params.arguments?.path);
|
|
2472
|
-
if (!fs.existsSync(inputPath)) {
|
|
2473
|
-
throw new McpError(ErrorCode.InvalidParams, `Path not found: ${inputPath}`);
|
|
2474
|
-
}
|
|
2475
|
-
try {
|
|
2476
|
-
const report = await mediaAnalyzer.analyze(inputPath);
|
|
2477
|
-
let output = `## Media Quality Report\n\n`;
|
|
2478
|
-
output += `- **Total Files**: ${report.total_files}\n`;
|
|
2479
|
-
output += `- **OK Files**: ${report.ok_files}\n`;
|
|
2480
|
-
output += `- **Failed Files**: ${report.failed_files}\n`;
|
|
2481
|
-
if ('avg_audio_duration' in report && report.avg_audio_duration) {
|
|
2482
|
-
output += `- **Average Audio Duration**: ${report.avg_audio_duration.toFixed(2)}s\n`;
|
|
2483
|
-
}
|
|
2484
|
-
if ('avg_video_duration' in report && report.avg_video_duration) {
|
|
2485
|
-
output += `- **Average Video Duration**: ${report.avg_video_duration.toFixed(2)}s\n`;
|
|
2486
|
-
output += `- **Average FPS**: ${report.avg_fps?.toFixed(1)}\n`;
|
|
2487
|
-
}
|
|
2488
|
-
output += `\n### Sample Detail (Top 5)\n`;
|
|
2489
|
-
report.details.slice(0, 5).forEach(item => {
|
|
2490
|
-
const statusLabel = item.status === "ok" ? "[ok]" : "[error]";
|
|
2491
|
-
if (item.type === "audio" && 'sample_rate' in item) {
|
|
2492
|
-
output += `${statusLabel} **${item.filename}**: ${item.duration?.toFixed(1)}s | SR: ${item.sample_rate}Hz | Silent: ${item.is_silent}\n`;
|
|
2493
|
-
}
|
|
2494
|
-
else if (item.type === "video" && 'width' in item) {
|
|
2495
|
-
output += `${statusLabel} **${item.filename}**: ${item.width}x${item.height} | ${item.fps?.toFixed(1)}fps | Risk: ${item.corruption_risk}\n`;
|
|
2496
|
-
}
|
|
2497
|
-
else {
|
|
2498
|
-
output += `${statusLabel} **${item.filename}**: ${item.error}\n`;
|
|
2499
|
-
}
|
|
2500
|
-
});
|
|
2501
|
-
return {
|
|
2502
|
-
content: [{ type: "text", text: output }]
|
|
2503
|
-
};
|
|
2504
|
-
}
|
|
2505
|
-
catch (error) {
|
|
2506
|
-
return {
|
|
2507
|
-
content: [{ type: "text", text: `ERROR: Media analysis failed: ${error.message}` }],
|
|
2508
|
-
isError: true
|
|
2509
|
-
};
|
|
2510
|
-
}
|
|
2511
|
-
}
|
|
2512
|
-
case "generate_quality_report": {
|
|
2513
|
-
const datasetId = String(request.params.arguments?.dataset_id);
|
|
2514
|
-
const datasetPath = String(request.params.arguments?.dataset_path);
|
|
2515
|
-
if (!fs.existsSync(datasetPath)) {
|
|
2516
|
-
throw new McpError(ErrorCode.InvalidParams, `Dataset path not found: ${datasetPath}`);
|
|
2517
|
-
}
|
|
2518
|
-
try {
|
|
2519
|
-
// Optionally load text quality from metadata if available
|
|
2520
|
-
const metadata = await metadataStore.getDataset(datasetId);
|
|
2521
|
-
// TODO: Integrate text quality analysis when available
|
|
2522
|
-
const textQuality = null;
|
|
2523
|
-
const report = await qualityOrchestrator.generateReport(datasetId, datasetPath, textQuality);
|
|
2524
|
-
// Save report to metadata
|
|
2525
|
-
if (metadata) {
|
|
2526
|
-
metadata.unified_quality_report = report;
|
|
2527
|
-
await metadataStore.saveDataset(metadata);
|
|
2528
|
-
}
|
|
2529
|
-
let output = `# Unified Quality Report\n\n`;
|
|
2530
|
-
output += `**Dataset**: ${datasetId}\n`;
|
|
2531
|
-
output += `**Modalities**: ${report.modalities.join(", ")}\n`;
|
|
2532
|
-
output += `**Overall Quality Score**: ${report.overall_quality_score}/100\n\n`;
|
|
2533
|
-
if (report.text_quality) {
|
|
2534
|
-
output += `## Text Quality\n`;
|
|
2535
|
-
output += `- Rows: ${report.text_quality.row_count}\n`;
|
|
2536
|
-
output += `- Columns: ${report.text_quality.column_count}\n`;
|
|
2537
|
-
output += `- Missing: ${report.text_quality.missing_percentage.toFixed(1)}%\n`;
|
|
2538
|
-
output += `- Duplicates: ${report.text_quality.duplicate_percentage.toFixed(1)}%\n\n`;
|
|
2539
|
-
}
|
|
2540
|
-
if (report.image_quality) {
|
|
2541
|
-
output += `## Image Quality\n`;
|
|
2542
|
-
output += `- Total Images: ${report.image_quality.total_images}\n`;
|
|
2543
|
-
output += `- Corrupted: ${report.image_quality.corrupted_count}\n`;
|
|
2544
|
-
output += `- Avg Resolution: ${report.image_quality.avg_resolution}\n`;
|
|
2545
|
-
output += `- Blurry: ${report.image_quality.blurry_percentage.toFixed(1)}%\n\n`;
|
|
2546
|
-
}
|
|
2547
|
-
if (report.audio_quality) {
|
|
2548
|
-
output += `## Audio Quality\n`;
|
|
2549
|
-
output += `- Total Files: ${report.audio_quality.total_files}\n`;
|
|
2550
|
-
output += `- Avg Duration: ${report.audio_quality.avg_duration.toFixed(1)}s\n`;
|
|
2551
|
-
output += `- Avg Sample Rate: ${report.audio_quality.avg_sample_rate.toFixed(0)}Hz\n`;
|
|
2552
|
-
output += `- Silent: ${report.audio_quality.silent_percentage.toFixed(1)}%\n\n`;
|
|
2553
|
-
}
|
|
2554
|
-
if (report.video_quality) {
|
|
2555
|
-
output += `## Video Quality\n`;
|
|
2556
|
-
output += `- Total Files: ${report.video_quality.total_files}\n`;
|
|
2557
|
-
output += `- Avg Duration: ${report.video_quality.avg_duration.toFixed(1)}s\n`;
|
|
2558
|
-
output += `- Avg FPS: ${report.video_quality.avg_fps.toFixed(1)}\n`;
|
|
2559
|
-
output += `- High Corruption Risk: ${report.video_quality.corruption_risk_high}\n\n`;
|
|
2560
|
-
}
|
|
2561
|
-
output += `## Recommendations\n`;
|
|
2562
|
-
report.recommendations.forEach(rec => {
|
|
2563
|
-
output += `- ${rec}\n`;
|
|
2564
|
-
});
|
|
2565
|
-
return {
|
|
2566
|
-
content: [{ type: "text", text: output }]
|
|
2567
|
-
};
|
|
2568
|
-
}
|
|
2569
|
-
catch (error) {
|
|
2570
|
-
return {
|
|
2571
|
-
content: [{ type: "text", text: `ERROR: Quality report generation failed: ${error.message}` }],
|
|
2572
|
-
isError: true
|
|
2573
|
-
};
|
|
2574
|
-
}
|
|
2575
|
-
}
|
|
2576
|
-
default:
|
|
2577
|
-
throw new McpError(ErrorCode.MethodNotFound, "Tool not found");
|
|
2578
|
-
}
|
|
2579
|
-
}); // end requestQueue.enqueue
|
|
2580
|
-
});
|
|
2581
|
-
async function main() {
|
|
2582
|
-
const args = process.argv.slice(2);
|
|
2583
|
-
hydrateExternalKeys();
|
|
2584
|
-
const isFuse = args.includes("fuse");
|
|
2585
|
-
const isDiscover = args.includes("discover");
|
|
2586
|
-
const isDownload = args.includes("download");
|
|
2587
|
-
const isExport = args.includes("export");
|
|
2588
|
-
const isConfig = args.includes("config") || args.includes("configure");
|
|
2589
|
-
const isSetup = args.includes("--setup") || args.includes("setup");
|
|
2590
|
-
const isSilent = args.includes("--silent");
|
|
2591
|
-
if (process.stdin.isTTY && !isSilent) {
|
|
2592
|
-
printLaunchScreen();
|
|
2593
|
-
}
|
|
2594
|
-
if (isFuse) {
|
|
2595
|
-
await runFuseCli(args);
|
|
2596
|
-
return;
|
|
2597
|
-
}
|
|
2598
|
-
if (isConfig) {
|
|
2599
|
-
await runConfigCli(args);
|
|
2600
|
-
return;
|
|
2601
|
-
}
|
|
2602
|
-
if (isDiscover) {
|
|
2603
|
-
await runDiscoverCli(args);
|
|
2604
|
-
return;
|
|
2605
|
-
}
|
|
2606
|
-
if (isDownload) {
|
|
2607
|
-
await runDownloadCli(args);
|
|
2608
|
-
return;
|
|
2609
|
-
}
|
|
2610
|
-
if (isExport) {
|
|
2611
|
-
await runExportCli(args);
|
|
2612
|
-
return;
|
|
2613
|
-
}
|
|
2614
|
-
// If run in explicit setup mode, show setup wizard (do not auto-run on server startup)
|
|
2615
|
-
if (isSetup) {
|
|
2616
|
-
await runSetupWizard(isSilent);
|
|
2617
|
-
return;
|
|
2618
|
-
}
|
|
2619
|
-
// Otherwise proceed to server mode (for IDEs/Agents)
|
|
2620
|
-
console.error(`[Vesper] Starting server...`);
|
|
2621
|
-
console.error(`[Vesper] dataRoot: ${dataRoot}`);
|
|
2622
|
-
console.error(`[Vesper] dbPath: ${dbPath}`);
|
|
2623
|
-
const transport = new StdioServerTransport();
|
|
2624
|
-
await server.connect(transport);
|
|
2625
|
-
console.error("Vesper MCP server running on stdio");
|
|
2626
|
-
console.error("Tip: To configure Vesper for your IDE, run: npx -y -p @vespermcp/mcp-server@latest vespermcp --setup");
|
|
2627
|
-
await new Promise((resolve) => {
|
|
2628
|
-
const done = () => resolve();
|
|
2629
|
-
process.stdin.resume();
|
|
2630
|
-
process.stdin.once("end", done);
|
|
2631
|
-
process.stdin.once("close", done);
|
|
2632
|
-
process.once("SIGINT", done);
|
|
2633
|
-
process.once("SIGTERM", done);
|
|
2634
|
-
});
|
|
2635
|
-
console.error("[Vesper] Main loop finished");
|
|
2636
|
-
}
|
|
2637
|
-
async function runConfigCli(args) {
|
|
2638
|
-
const isKeys = args.includes("keys");
|
|
2639
|
-
const isKaggle = args.includes("kaggle");
|
|
2640
|
-
if (!(isKeys || isKaggle) || args.includes("--help")) {
|
|
2641
|
-
console.log("Usage: vespermcp config keys");
|
|
2642
|
-
console.log(" vespermcp configure keys");
|
|
2643
|
-
console.log(" vespermcp config kaggle --username <name> --key <api_key>");
|
|
2644
|
-
console.log(" vespermcp configure kaggle --username <name> --key <api_key>");
|
|
2645
|
-
console.log("Core Vesper tools work with zero API keys.");
|
|
2646
|
-
return;
|
|
2647
|
-
}
|
|
2648
|
-
const getArgValue = (name) => {
|
|
2649
|
-
const idx = args.findIndex(a => a === name);
|
|
2650
|
-
if (idx >= 0 && idx + 1 < args.length)
|
|
2651
|
-
return args[idx + 1];
|
|
2652
|
-
return undefined;
|
|
2653
|
-
};
|
|
2654
|
-
if (isKeys) {
|
|
2655
|
-
console.log("\nVesper Optional Keys Setup");
|
|
2656
|
-
console.log("(Press Enter to skip any field)\n");
|
|
2657
|
-
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
|
|
2658
|
-
const ask = (q) => new Promise(resolve => rl.question(q, resolve));
|
|
2659
|
-
const current = secureKeys.getAll();
|
|
2660
|
-
const hfToken = (await ask(`Hugging Face token [${current.hf_token ? "saved" : "empty"}]: `)).trim();
|
|
2661
|
-
const kaggleUsername = (await ask(`Kaggle username [${current.kaggle_username ? "saved" : "empty"}]: `)).trim();
|
|
2662
|
-
const kaggleKey = (await ask(`Kaggle key [${current.kaggle_key ? "saved" : "empty"}]: `)).trim();
|
|
2663
|
-
const dataworldToken = (await ask(`data.world token [${current.dataworld_token ? "saved" : "empty"}]: `)).trim();
|
|
2664
|
-
rl.close();
|
|
2665
|
-
const saved = [];
|
|
2666
|
-
if (hfToken) {
|
|
2667
|
-
const res = secureKeys.set("hf_token", hfToken);
|
|
2668
|
-
if (res.ok) {
|
|
2669
|
-
process.env.HF_TOKEN = hfToken;
|
|
2670
|
-
saved.push("HF token");
|
|
2671
|
-
}
|
|
2672
|
-
}
|
|
2673
|
-
if (kaggleUsername) {
|
|
2674
|
-
const res = secureKeys.set("kaggle_username", kaggleUsername);
|
|
2675
|
-
if (res.ok) {
|
|
2676
|
-
process.env.KAGGLE_USERNAME = kaggleUsername;
|
|
2677
|
-
saved.push("Kaggle username");
|
|
2678
|
-
}
|
|
2679
|
-
}
|
|
2680
|
-
if (kaggleKey) {
|
|
2681
|
-
const res = secureKeys.set("kaggle_key", kaggleKey);
|
|
2682
|
-
if (res.ok) {
|
|
2683
|
-
process.env.KAGGLE_KEY = kaggleKey;
|
|
2684
|
-
saved.push("Kaggle key");
|
|
2685
|
-
}
|
|
2686
|
-
}
|
|
2687
|
-
if (dataworldToken) {
|
|
2688
|
-
const res = secureKeys.set("dataworld_token", dataworldToken);
|
|
2689
|
-
if (res.ok) {
|
|
2690
|
-
process.env.DW_AUTH_TOKEN = dataworldToken;
|
|
2691
|
-
saved.push("data.world token");
|
|
2692
|
-
}
|
|
2693
|
-
}
|
|
2694
|
-
if (saved.length === 0) {
|
|
2695
|
-
console.log("No new keys saved (all skipped). Core tools continue to work without keys.");
|
|
2696
|
-
return;
|
|
2697
|
-
}
|
|
2698
|
-
console.log(`Key(s) saved securely: ${saved.join(", ")}`);
|
|
2699
|
-
console.log("You can now use Kaggle, data.world, and gated Hugging Face datasets.");
|
|
2700
|
-
return;
|
|
2701
|
-
}
|
|
2702
|
-
// Backward-compatible Kaggle-specific path
|
|
2703
|
-
let username = getArgValue("--username") || "";
|
|
2704
|
-
let key = getArgValue("--key") || "";
|
|
2705
|
-
if (!username || !key) {
|
|
2706
|
-
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
|
|
2707
|
-
const ask = (q) => new Promise(resolve => rl.question(q, resolve));
|
|
2708
|
-
if (!username)
|
|
2709
|
-
username = (await ask("Kaggle username: ")).trim();
|
|
2710
|
-
if (!key)
|
|
2711
|
-
key = (await ask("Kaggle key: ")).trim();
|
|
2712
|
-
rl.close();
|
|
2713
|
-
}
|
|
2714
|
-
if (!username || !key) {
|
|
2715
|
-
console.error("Missing Kaggle username/key. Aborting.");
|
|
2716
|
-
process.exit(1);
|
|
2717
|
-
}
|
|
2718
|
-
secureKeys.set("kaggle_username", username);
|
|
2719
|
-
secureKeys.set("kaggle_key", key);
|
|
2720
|
-
process.env.KAGGLE_USERNAME = username;
|
|
2721
|
-
process.env.KAGGLE_KEY = key;
|
|
2722
|
-
console.log("Key saved securely. You can now use Kaggle datasets.");
|
|
2723
|
-
}
|
|
2724
|
-
async function runDiscoverCli(args) {
|
|
2725
|
-
const getArgValue = (name) => {
|
|
2726
|
-
const idx = args.findIndex(a => a === name);
|
|
2727
|
-
if (idx >= 0 && idx + 1 < args.length)
|
|
2728
|
-
return args[idx + 1];
|
|
2729
|
-
return undefined;
|
|
2730
|
-
};
|
|
2731
|
-
const source = (getArgValue("--source") || "huggingface").toLowerCase();
|
|
2732
|
-
const limit = Number(getArgValue("--limit") || "10");
|
|
2733
|
-
const queryParts = [];
|
|
2734
|
-
for (let i = 0; i < args.length; i++) {
|
|
2735
|
-
const token = args[i];
|
|
2736
|
-
if (token === "discover")
|
|
2737
|
-
continue;
|
|
2738
|
-
if (token === "--source" || token === "--limit") {
|
|
2739
|
-
i += 1;
|
|
2740
|
-
continue;
|
|
2741
|
-
}
|
|
2742
|
-
if (token.startsWith("--"))
|
|
2743
|
-
continue;
|
|
2744
|
-
queryParts.push(token);
|
|
2745
|
-
}
|
|
2746
|
-
const query = queryParts.join(" ").trim();
|
|
2747
|
-
if (!query) {
|
|
2748
|
-
console.error("Usage: vespermcp discover --source <huggingface|kaggle|openml|dataworld> \"credit risk\" --limit 10");
|
|
2749
|
-
process.exit(1);
|
|
2750
|
-
}
|
|
2751
|
-
if (source === "kaggle") {
|
|
2752
|
-
if (!dataIngestor.hasKaggleCredentials()) {
|
|
2753
|
-
console.error("Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).");
|
|
2754
|
-
if (process.stdin.isTTY) {
|
|
2755
|
-
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
|
|
2756
|
-
const answer = await new Promise(resolve => rl.question("Configure Kaggle now? [y/N]: ", resolve));
|
|
2757
|
-
rl.close();
|
|
2758
|
-
if (answer.trim().toLowerCase() === "y") {
|
|
2759
|
-
await runConfigCli(["config", "kaggle"]);
|
|
2760
|
-
}
|
|
2761
|
-
}
|
|
2762
|
-
if (!dataIngestor.hasKaggleCredentials())
|
|
2763
|
-
process.exit(1);
|
|
2764
|
-
}
|
|
2765
|
-
try {
|
|
2766
|
-
const results = await kaggleSource.discover(query, limit);
|
|
2767
|
-
console.log(formatSearchResults(results));
|
|
2768
|
-
}
|
|
2769
|
-
catch (error) {
|
|
2770
|
-
const msg = String(error?.message || error);
|
|
2771
|
-
if (msg.toLowerCase().includes("kaggle package not installed")) {
|
|
2772
|
-
console.error("Kaggle support is optional and needs the official client: pip install kaggle");
|
|
2773
|
-
}
|
|
2774
|
-
else {
|
|
2775
|
-
console.error(`Kaggle discover failed: ${msg}`);
|
|
2776
|
-
}
|
|
2777
|
-
process.exit(1);
|
|
2778
|
-
}
|
|
2779
|
-
return;
|
|
2780
|
-
}
|
|
2781
|
-
else if (source === "openml") {
|
|
2782
|
-
try {
|
|
2783
|
-
const openmlSource = new OpenMLSource();
|
|
2784
|
-
const results = await openmlSource.discover(query, limit);
|
|
2785
|
-
console.log(formatSearchResults(results));
|
|
2786
|
-
}
|
|
2787
|
-
catch (error) {
|
|
2788
|
-
console.error(`OpenML discover failed: ${error.message || error}`);
|
|
2789
|
-
process.exit(1);
|
|
2790
|
-
}
|
|
2791
|
-
return;
|
|
2792
|
-
}
|
|
2793
|
-
else if (source === "dataworld") {
|
|
2794
|
-
if (!hasDataWorldToken()) {
|
|
2795
|
-
console.error("data.world requires API token. Run 'vespermcp config keys' and set dataworld_token.");
|
|
2796
|
-
process.exit(1);
|
|
2797
|
-
}
|
|
2798
|
-
try {
|
|
2799
|
-
const dataworldSource = new DataWorldSource();
|
|
2800
|
-
const results = await dataworldSource.discover(query, limit);
|
|
2801
|
-
console.log(formatSearchResults(results));
|
|
2802
|
-
}
|
|
2803
|
-
catch (error) {
|
|
2804
|
-
console.error(`data.world discover failed: ${error.message || error}`);
|
|
2805
|
-
process.exit(1);
|
|
2806
|
-
}
|
|
2807
|
-
return;
|
|
2808
|
-
}
|
|
2809
|
-
const hf = new HuggingFaceScraper();
|
|
2810
|
-
const results = await hf.scrape(limit, true, query);
|
|
2811
|
-
console.log(formatSearchResults(results));
|
|
2812
|
-
}
|
|
2813
|
-
async function runDownloadCli(args) {
|
|
2814
|
-
// Usage: vespermcp download kaggle user/dataset-name [--target-dir C:/path]
|
|
2815
|
-
const targetIdx = args.findIndex(a => a === "--target-dir");
|
|
2816
|
-
const targetDir = targetIdx >= 0 && targetIdx + 1 < args.length ? args[targetIdx + 1] : undefined;
|
|
2817
|
-
const nonFlags = args.filter((a, i) => {
|
|
2818
|
-
if (a.startsWith("--"))
|
|
2819
|
-
return false;
|
|
2820
|
-
if (targetIdx >= 0 && i === targetIdx + 1)
|
|
2821
|
-
return false;
|
|
2822
|
-
return true;
|
|
2823
|
-
});
|
|
2824
|
-
const source = (nonFlags[1] || "").toLowerCase();
|
|
2825
|
-
const datasetId = nonFlags[2] || "";
|
|
2826
|
-
if (!source || !datasetId) {
|
|
2827
|
-
console.error("Usage: vespermcp download <huggingface|kaggle|openml|dataworld> <dataset-id> [--target-dir C:/path]");
|
|
2828
|
-
process.exit(1);
|
|
2829
|
-
}
|
|
2830
|
-
if (source === "kaggle" && !dataIngestor.hasKaggleCredentials()) {
|
|
2831
|
-
console.error("Kaggle support requires API key. Run 'vespermcp config keys' (30 seconds).");
|
|
2832
|
-
if (process.stdin.isTTY) {
|
|
2833
|
-
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
|
|
2834
|
-
const answer = await new Promise(resolve => rl.question("Configure Kaggle now? [y/N]: ", resolve));
|
|
2835
|
-
rl.close();
|
|
2836
|
-
if (answer.trim().toLowerCase() === "y") {
|
|
2837
|
-
await runConfigCli(["config", "kaggle"]);
|
|
2838
|
-
}
|
|
2839
|
-
}
|
|
2840
|
-
if (!dataIngestor.hasKaggleCredentials())
|
|
2841
|
-
process.exit(1);
|
|
2842
|
-
}
|
|
2843
|
-
if (source === "dataworld" && !hasDataWorldToken()) {
|
|
2844
|
-
console.error("data.world requires API token. Run 'vespermcp config keys' and set dataworld_token.");
|
|
2845
|
-
process.exit(1);
|
|
2846
|
-
}
|
|
2847
|
-
let localPath = "";
|
|
2848
|
-
try {
|
|
2849
|
-
if (source === "kaggle" && targetDir) {
|
|
2850
|
-
const normalized = datasetId.includes("kaggle.com/datasets/")
|
|
2851
|
-
? datasetId.split("kaggle.com/datasets/")[1].replace(/^\//, "")
|
|
2852
|
-
: datasetId;
|
|
2853
|
-
const dl = await kaggleSource.download(normalized, targetDir);
|
|
2854
|
-
localPath = dl.local_path;
|
|
2855
|
-
const size = fs.existsSync(localPath) ? fs.statSync(localPath).size : 0;
|
|
2856
|
-
metadataStore.registerDownload(normalized, localPath, "completed", size);
|
|
2857
|
-
try {
|
|
2858
|
-
upsertRegistry(datasetId, localPath, "completed");
|
|
2859
|
-
}
|
|
2860
|
-
catch (e) {
|
|
2861
|
-
console.error(`[Registry] Failed to write registry for ${datasetId}: ${e?.message || e}`);
|
|
2862
|
-
}
|
|
2863
|
-
}
|
|
2864
|
-
else {
|
|
2865
|
-
localPath = await dataIngestor.ensureData(datasetId, source, (msg) => console.log(msg));
|
|
2866
|
-
try {
|
|
2867
|
-
upsertRegistry(datasetId, localPath, "completed");
|
|
2868
|
-
}
|
|
2869
|
-
catch (e) {
|
|
2870
|
-
console.error(`[Registry] Failed to write registry for ${datasetId}: ${e?.message || e}`);
|
|
2871
|
-
}
|
|
2872
|
-
}
|
|
2873
|
-
}
|
|
2874
|
-
catch (error) {
|
|
2875
|
-
const msg = String(error?.message || error);
|
|
2876
|
-
if (source === "kaggle" && msg.toLowerCase().includes("kaggle package not installed")) {
|
|
2877
|
-
console.error("Kaggle support is optional and needs the official client: pip install kaggle");
|
|
2878
|
-
}
|
|
2879
|
-
else {
|
|
2880
|
-
console.error(`Download failed: ${msg}`);
|
|
2881
|
-
}
|
|
2882
|
-
process.exit(1);
|
|
2883
|
-
}
|
|
2884
|
-
console.log(`Download complete: ${localPath}`);
|
|
2885
|
-
}
|
|
2886
|
-
async function runExportCli(args) {
|
|
2887
|
-
const getArgValue = (name) => {
|
|
2888
|
-
const idx = args.findIndex(a => a === name);
|
|
2889
|
-
if (idx >= 0 && idx + 1 < args.length)
|
|
2890
|
-
return args[idx + 1];
|
|
2891
|
-
return undefined;
|
|
2892
|
-
};
|
|
2893
|
-
const nonFlags = args.filter((arg, index) => {
|
|
2894
|
-
if (arg.startsWith("--"))
|
|
2895
|
-
return false;
|
|
2896
|
-
const previous = index > 0 ? args[index - 1] : "";
|
|
2897
|
-
if (["--target-dir", "--format", "--compression", "--sample-rows", "--columns"].includes(previous))
|
|
2898
|
-
return false;
|
|
2899
|
-
return true;
|
|
2900
|
-
});
|
|
2901
|
-
const datasetId = nonFlags[1] || "";
|
|
2902
|
-
if (!datasetId) {
|
|
2903
|
-
console.error("Usage: vespermcp export <dataset-id|local-path> [--format parquet|feather|csv|jsonl|arrow] [--target-dir C:/path] [--compression snappy] [--fast] [--preview] [--sample-rows N] [--columns col1,col2]");
|
|
2904
|
-
process.exit(1);
|
|
2905
|
-
}
|
|
2906
|
-
const requestedFormat = getArgValue("--format") || "parquet";
|
|
2907
|
-
const targetDir = getArgValue("--target-dir");
|
|
2908
|
-
const compression = getArgValue("--compression");
|
|
2909
|
-
const sampleRows = getArgValue("--sample-rows");
|
|
2910
|
-
const columns = getArgValue("--columns");
|
|
2911
|
-
const fastMode = args.includes("--fast");
|
|
2912
|
-
const preview = args.includes("--preview");
|
|
2913
|
-
let sourcePath = resolveDatasetLocalPath(datasetId);
|
|
2914
|
-
if (!sourcePath) {
|
|
2915
|
-
console.error(`Export failed: no local data found for ${datasetId}. Run download or prepare first, or pass a direct local path.`);
|
|
2916
|
-
process.exit(1);
|
|
2917
|
-
}
|
|
2918
|
-
sourcePath = ensureExportableLocalPath(sourcePath);
|
|
2919
|
-
try {
|
|
2920
|
-
upsertRegistry(datasetId, sourcePath, "completed");
|
|
2921
|
-
}
|
|
2922
|
-
catch (e) {
|
|
2923
|
-
console.error(`[Registry] Failed to normalize registry path for ${datasetId}: ${e?.message || e}`);
|
|
2924
|
-
}
|
|
2925
|
-
if (!fastMode) {
|
|
2926
|
-
const currentExt = path.extname(sourcePath).substring(1).toLowerCase();
|
|
2927
|
-
const pipelineFmt = (requestedFormat === "csv" || requestedFormat === "parquet") ? requestedFormat : "parquet";
|
|
2928
|
-
const pipelineCompatibleInput = currentExt === "csv" || currentExt === "parquet";
|
|
2929
|
-
if (pipelineCompatibleInput && currentExt !== pipelineFmt) {
|
|
2930
|
-
try {
|
|
2931
|
-
const pipelineResult = await pipelineExecutor.runPipeline(datasetId, sourcePath, pipelineFmt);
|
|
2932
|
-
if (pipelineResult.final_output_path) {
|
|
2933
|
-
sourcePath = pipelineResult.final_output_path;
|
|
2934
|
-
upsertRegistry(datasetId, sourcePath, "completed");
|
|
2935
|
-
}
|
|
2936
|
-
}
|
|
2937
|
-
catch (err) {
|
|
2938
|
-
console.error(`[Export] Pipeline warning: ${err.message}. Continuing with raw file.`);
|
|
2939
|
-
}
|
|
2940
|
-
}
|
|
2941
|
-
}
|
|
2942
|
-
const exportOpts = {};
|
|
2943
|
-
if (compression)
|
|
2944
|
-
exportOpts.compression = compression;
|
|
2945
|
-
if (preview)
|
|
2946
|
-
exportOpts.preview = true;
|
|
2947
|
-
if (sampleRows)
|
|
2948
|
-
exportOpts.sample_rows = Number(sampleRows);
|
|
2949
|
-
if (columns)
|
|
2950
|
-
exportOpts.columns = columns.split(",").map(col => col.trim()).filter(Boolean);
|
|
2951
|
-
const extMap = { feather: ".feather", parquet: ".parquet", csv: ".csv", jsonl: ".jsonl", arrow: ".arrow", tfrecord: ".tfrecord" };
|
|
2952
|
-
const ext = extMap[requestedFormat] || ".parquet";
|
|
2953
|
-
const safeName = toSafeDatasetPathFragment(datasetId);
|
|
2954
|
-
const outDir = targetDir || process.cwd();
|
|
2955
|
-
if (!fs.existsSync(outDir))
|
|
2956
|
-
fs.mkdirSync(outDir, { recursive: true });
|
|
2957
|
-
const outputFile = path.join(outDir, `${safeName}${ext}`);
|
|
2958
|
-
console.error(`[Export] Resolved output directory: ${outDir}`);
|
|
2959
|
-
console.error(`[Export] Output file: ${outputFile}`);
|
|
2960
|
-
const result = await dataExporter.export(sourcePath, outputFile, requestedFormat, exportOpts);
|
|
2961
|
-
console.log(`Export complete: ${result.output_path}`);
|
|
2962
|
-
console.log(`Format: ${result.format}${result.compression ? ` (${result.compression})` : ""}`);
|
|
2963
|
-
if (result.rows !== undefined)
|
|
2964
|
-
console.log(`Rows: ${result.rows.toLocaleString()}`);
|
|
2965
|
-
if (result.columns !== undefined)
|
|
2966
|
-
console.log(`Columns: ${result.columns}`);
|
|
2967
|
-
if (result.file_size_mb !== undefined)
|
|
2968
|
-
console.log(`Size: ${result.file_size_mb} MB`);
|
|
2969
|
-
if (result.preview_path)
|
|
2970
|
-
console.log(`Preview: ${result.preview_path}`);
|
|
2971
|
-
}
|
|
2972
|
-
async function runFuseCli(args) {
|
|
2973
|
-
const getArgValue = (name) => {
|
|
2974
|
-
const idx = args.findIndex(a => a === name);
|
|
2975
|
-
if (idx >= 0 && idx + 1 < args.length)
|
|
2976
|
-
return args[idx + 1];
|
|
2977
|
-
return undefined;
|
|
2978
|
-
};
|
|
2979
|
-
const collectListAfter = (name) => {
|
|
2980
|
-
const idx = args.findIndex(a => a === name);
|
|
2981
|
-
if (idx < 0)
|
|
2982
|
-
return [];
|
|
2983
|
-
const out = [];
|
|
2984
|
-
for (let i = idx + 1; i < args.length; i++) {
|
|
2985
|
-
if (args[i].startsWith("--"))
|
|
2986
|
-
break;
|
|
2987
|
-
out.push(args[i]);
|
|
2988
|
-
}
|
|
2989
|
-
return out;
|
|
2990
|
-
};
|
|
2991
|
-
const sources = collectListAfter("--sources");
|
|
2992
|
-
if (sources.length < 2) {
|
|
2993
|
-
console.error("Usage: vespermcp fuse --sources <file1> <file2> [more] --strategy concat|join [--on id] [--how inner|left|outer] [--dedup] [--quality] [--leakage] [--format feather|parquet|csv|jsonl|arrow]");
|
|
2994
|
-
process.exit(1);
|
|
2995
|
-
}
|
|
2996
|
-
const strategy = getArgValue("--strategy") || "concat";
|
|
2997
|
-
const onValue = getArgValue("--on");
|
|
2998
|
-
const joinOn = onValue ? onValue.split(",").map(s => s.trim()).filter(Boolean) : undefined;
|
|
2999
|
-
const how = getArgValue("--how") || "inner";
|
|
3000
|
-
const outputFormat = getArgValue("--format") || "feather";
|
|
3001
|
-
const compression = getArgValue("--compression");
|
|
3002
|
-
const outputPath = getArgValue("--output") || path.join(process.cwd(), `fused_${Date.now()}.${outputFormat === "arrow" ? "arrow" : outputFormat}`);
|
|
3003
|
-
const dedup = args.includes("--dedup");
|
|
3004
|
-
const runQualityAfter = args.includes("--quality");
|
|
3005
|
-
const leakageCheck = args.includes("--leakage");
|
|
3006
|
-
const preview = !args.includes("--no-preview");
|
|
3007
|
-
const result = await fusionEngine.fuse(sources, outputPath, {
|
|
3008
|
-
strategy,
|
|
3009
|
-
join_on: joinOn,
|
|
3010
|
-
how,
|
|
3011
|
-
dedup,
|
|
3012
|
-
run_quality_after: runQualityAfter,
|
|
3013
|
-
leakage_check: leakageCheck,
|
|
3014
|
-
output_format: outputFormat,
|
|
3015
|
-
compression,
|
|
3016
|
-
preview,
|
|
3017
|
-
});
|
|
3018
|
-
const nullDelta = result.stats.null_delta;
|
|
3019
|
-
const nullText = nullDelta >= 0 ? `+${nullDelta}%` : `${nullDelta}%`;
|
|
3020
|
-
console.log(`Fused ${result.stats.sources_count} sources → ${result.stats.rows_after.toLocaleString()} rows (from ${result.stats.rows_before.toLocaleString()}).`);
|
|
3021
|
-
console.log(`Null increase: ${nullText}`);
|
|
3022
|
-
console.log(`Output: ${result.output_path}`);
|
|
3023
|
-
if (result.preview_path)
|
|
3024
|
-
console.log(`Preview saved: ${result.preview_path}`);
|
|
3025
|
-
console.log("Next: run vespermcp split/export on the fused dataset");
|
|
3026
|
-
}
|
|
3027
|
-
async function runSetupWizard(silent = false) {
|
|
3028
|
-
if (!silent && process.stdin.isTTY) {
|
|
3029
|
-
const wizardCandidates = [
|
|
3030
|
-
path.join(appRoot, "scripts", "wizard.js"),
|
|
3031
|
-
path.join(appRoot, "src", "scripts", "wizard.js"),
|
|
3032
|
-
path.join(process.cwd(), "vesper-wizard", "wizard.js"),
|
|
3033
|
-
];
|
|
3034
|
-
const wizardScript = wizardCandidates.find(candidate => fs.existsSync(candidate));
|
|
3035
|
-
if (wizardScript) {
|
|
3036
|
-
console.error("[Vesper Setup] Running guided wizard...");
|
|
3037
|
-
const result = spawnSync(process.execPath, [wizardScript], {
|
|
3038
|
-
stdio: "inherit",
|
|
3039
|
-
env: process.env,
|
|
3040
|
-
});
|
|
3041
|
-
if ((result.status ?? 1) !== 0) {
|
|
3042
|
-
console.error("[Vesper Setup] Wizard exited with non-zero status, continuing with automatic MCP config only.");
|
|
3043
|
-
}
|
|
3044
|
-
}
|
|
3045
|
-
}
|
|
3046
|
-
const configManager = new ConfigManager();
|
|
3047
|
-
if (!silent) {
|
|
3048
|
-
console.error(`\nVesper MCP - Universal Setup`);
|
|
3049
|
-
console.error(`================================`);
|
|
3050
|
-
console.error(`Installing to all detected coding agents...\n`);
|
|
3051
|
-
}
|
|
3052
|
-
const result = await runWithSpinner("Installing to detected coding agents", () => configManager.installToAll());
|
|
3053
|
-
if (result.success.length === 0 && result.failed.length === 0) {
|
|
3054
|
-
if (!silent) {
|
|
3055
|
-
console.error("\nNo supported agents detected.");
|
|
3056
|
-
console.error("Supported agents: Claude Code, Claude Desktop, Cursor, VS Code, Codex, Antigravity");
|
|
3057
|
-
console.error("\nMake sure at least one is installed, then try again.");
|
|
3058
|
-
}
|
|
3059
|
-
return;
|
|
3060
|
-
}
|
|
3061
|
-
if (!silent) {
|
|
3062
|
-
console.error("Setup complete! Please RESTART your IDE(s) to apply changes.");
|
|
3063
|
-
}
|
|
3064
|
-
}
|
|
3065
|
-
main().catch((error) => {
|
|
3066
|
-
console.error("Server error:", error);
|
|
3067
|
-
process.exit(1);
|
|
3068
|
-
});
|