vesper-wizard 2.3.1 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -322
- package/package.json +34 -100
- package/vesper-mcp-config.json +6 -0
- package/{scripts/wizard.js → wizard.js} +34 -10
- package/LICENSE +0 -21
- package/build/cache/cdn.js +0 -34
- package/build/cache/service.js +0 -63
- package/build/cleaning/cleaner.js +0 -81
- package/build/cleaning/evaluator.js +0 -89
- package/build/cleaning/executor.js +0 -62
- package/build/cleaning/exporter.js +0 -87
- package/build/cleaning/planner.js +0 -127
- package/build/cleaning/rules.js +0 -57
- package/build/cleaning/types.js +0 -1
- package/build/cloud/adapters/local.js +0 -37
- package/build/cloud/adapters/s3.js +0 -24
- package/build/cloud/adapters/supabase.js +0 -49
- package/build/cloud/storage-manager.js +0 -26
- package/build/cloud/types.js +0 -1
- package/build/compliance/service.js +0 -73
- package/build/compliance/store.js +0 -80
- package/build/compliance/types.js +0 -1
- package/build/config/config-manager.js +0 -221
- package/build/config/secure-keys.js +0 -51
- package/build/config/user-config.js +0 -48
- package/build/data/processing-worker.js +0 -23
- package/build/data/streaming.js +0 -38
- package/build/data/worker-pool.js +0 -39
- package/build/export/exporter.js +0 -82
- package/build/export/packager.js +0 -100
- package/build/export/types.js +0 -1
- package/build/fusion/aligner.js +0 -56
- package/build/fusion/deduplicator.js +0 -69
- package/build/fusion/engine.js +0 -69
- package/build/fusion/harmonizer.js +0 -39
- package/build/fusion/orchestrator.js +0 -86
- package/build/fusion/types.js +0 -1
- package/build/gateway/unified-dataset-gateway.js +0 -410
- package/build/index.js +0 -3068
- package/build/ingestion/hf-downloader.js +0 -171
- package/build/ingestion/ingestor.js +0 -271
- package/build/ingestion/kaggle-downloader.js +0 -102
- package/build/install/install-service.js +0 -46
- package/build/jobs/manager.js +0 -136
- package/build/jobs/queue.js +0 -59
- package/build/jobs/types.js +0 -1
- package/build/lib/supabase.js +0 -3
- package/build/metadata/dataworld-source.js +0 -89
- package/build/metadata/domain.js +0 -147
- package/build/metadata/github-scraper.js +0 -47
- package/build/metadata/institutional-scrapers.js +0 -49
- package/build/metadata/kaggle-scraper.js +0 -182
- package/build/metadata/kaggle-source.js +0 -70
- package/build/metadata/license.js +0 -68
- package/build/metadata/monitoring-service.js +0 -107
- package/build/metadata/monitoring-store.js +0 -78
- package/build/metadata/monitoring-types.js +0 -1
- package/build/metadata/openml-source.js +0 -87
- package/build/metadata/quality.js +0 -48
- package/build/metadata/rate-limiter.js +0 -128
- package/build/metadata/scraper.js +0 -448
- package/build/metadata/store.js +0 -340
- package/build/metadata/types.js +0 -1
- package/build/metadata/uci-scraper.js +0 -49
- package/build/monitoring/observability.js +0 -76
- package/build/preparation/target-detector.js +0 -75
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +0 -94
- package/build/python/cleaner.py +0 -226
- package/build/python/config.py +0 -263
- package/build/python/convert_engine.py +0 -92
- package/build/python/dataworld_engine.py +0 -208
- package/build/python/export_engine.py +0 -288
- package/build/python/framework_adapters.py +0 -100
- package/build/python/fusion_engine.py +0 -368
- package/build/python/github_adapter.py +0 -106
- package/build/python/hf_fallback.py +0 -298
- package/build/python/image_engine.py +0 -86
- package/build/python/kaggle_engine.py +0 -295
- package/build/python/media_engine.py +0 -133
- package/build/python/nasa_adapter.py +0 -82
- package/build/python/normalize_engine.py +0 -83
- package/build/python/openml_engine.py +0 -146
- package/build/python/quality_engine.py +0 -267
- package/build/python/row_count.py +0 -54
- package/build/python/splitter_engine.py +0 -283
- package/build/python/target_engine.py +0 -154
- package/build/python/test_framework_adapters.py +0 -61
- package/build/python/test_fusion_engine.py +0 -89
- package/build/python/uci_adapter.py +0 -94
- package/build/python/vesper/__init__.py +0 -1
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +0 -1
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +0 -679
- package/build/python/vesper/core/download_recipe.py +0 -104
- package/build/python/worldbank_adapter.py +0 -99
- package/build/quality/analyzer.js +0 -93
- package/build/quality/image-analyzer.js +0 -114
- package/build/quality/media-analyzer.js +0 -115
- package/build/quality/quality-orchestrator.js +0 -162
- package/build/quality/types.js +0 -1
- package/build/scripts/build-index.js +0 -54
- package/build/scripts/check-db.js +0 -73
- package/build/scripts/check-jobs.js +0 -24
- package/build/scripts/check-naruto.js +0 -17
- package/build/scripts/cleanup-kaggle.js +0 -41
- package/build/scripts/demo-full-pipeline.js +0 -62
- package/build/scripts/demo-ui.js +0 -58
- package/build/scripts/e2e-demo.js +0 -72
- package/build/scripts/massive-scrape.js +0 -103
- package/build/scripts/ops-dashboard.js +0 -33
- package/build/scripts/repro-bug.js +0 -37
- package/build/scripts/repro-export-bug.js +0 -56
- package/build/scripts/scrape-metadata.js +0 -100
- package/build/scripts/search-cli.js +0 -26
- package/build/scripts/test-bias.js +0 -45
- package/build/scripts/test-caching.js +0 -51
- package/build/scripts/test-cleaning.js +0 -76
- package/build/scripts/test-cloud-storage.js +0 -48
- package/build/scripts/test-compliance.js +0 -58
- package/build/scripts/test-conversion.js +0 -64
- package/build/scripts/test-custom-rules.js +0 -58
- package/build/scripts/test-db-opt.js +0 -63
- package/build/scripts/test-export-custom.js +0 -33
- package/build/scripts/test-exporter.js +0 -53
- package/build/scripts/test-fusion.js +0 -61
- package/build/scripts/test-github.js +0 -27
- package/build/scripts/test-group-split.js +0 -52
- package/build/scripts/test-hf-download.js +0 -29
- package/build/scripts/test-holdout-manager.js +0 -61
- package/build/scripts/test-hybrid-search.js +0 -41
- package/build/scripts/test-image-analysis.js +0 -50
- package/build/scripts/test-ingestion-infra.js +0 -39
- package/build/scripts/test-install.js +0 -40
- package/build/scripts/test-institutional.js +0 -26
- package/build/scripts/test-integrity.js +0 -41
- package/build/scripts/test-jit.js +0 -42
- package/build/scripts/test-job-queue.js +0 -62
- package/build/scripts/test-kaggle-download.js +0 -34
- package/build/scripts/test-large-data.js +0 -50
- package/build/scripts/test-mcp-v5.js +0 -74
- package/build/scripts/test-media-analysis.js +0 -61
- package/build/scripts/test-monitoring.js +0 -91
- package/build/scripts/test-observability.js +0 -106
- package/build/scripts/test-packager.js +0 -55
- package/build/scripts/test-pipeline.js +0 -50
- package/build/scripts/test-planning.js +0 -64
- package/build/scripts/test-privacy.js +0 -38
- package/build/scripts/test-production-sync.js +0 -36
- package/build/scripts/test-quality.js +0 -43
- package/build/scripts/test-robust-ingestion.js +0 -41
- package/build/scripts/test-schema.js +0 -45
- package/build/scripts/test-split-validation.js +0 -40
- package/build/scripts/test-splitter.js +0 -93
- package/build/scripts/test-target-detector.js +0 -29
- package/build/scripts/test-uci.js +0 -27
- package/build/scripts/test-unified-quality.js +0 -86
- package/build/scripts/test-write.js +0 -14
- package/build/scripts/verify-integration.js +0 -57
- package/build/scripts/verify-priority.js +0 -33
- package/build/search/embedder.js +0 -34
- package/build/search/engine.js +0 -190
- package/build/search/jit-orchestrator.js +0 -262
- package/build/search/query-intent.js +0 -509
- package/build/search/vector-store.js +0 -123
- package/build/splitting/splitter.js +0 -82
- package/build/splitting/types.js +0 -1
- package/build/tools/formatter.js +0 -251
- package/build/utils/downloader.js +0 -52
- package/build/utils/python-runtime.js +0 -130
- package/build/utils/selector.js +0 -69
- package/mcp-config-template.json +0 -18
- package/scripts/postinstall.cjs +0 -170
- package/scripts/preindex_registry.cjs +0 -157
- package/scripts/refresh-index.cjs +0 -87
- package/scripts/wizard.cjs +0 -601
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +0 -94
- package/src/python/cleaner.py +0 -226
- package/src/python/config.py +0 -263
- package/src/python/convert_engine.py +0 -92
- package/src/python/dataworld_engine.py +0 -208
- package/src/python/export_engine.py +0 -288
- package/src/python/framework_adapters.py +0 -100
- package/src/python/fusion_engine.py +0 -368
- package/src/python/github_adapter.py +0 -106
- package/src/python/hf_fallback.py +0 -298
- package/src/python/image_engine.py +0 -86
- package/src/python/kaggle_engine.py +0 -295
- package/src/python/media_engine.py +0 -133
- package/src/python/nasa_adapter.py +0 -82
- package/src/python/normalize_engine.py +0 -83
- package/src/python/openml_engine.py +0 -146
- package/src/python/quality_engine.py +0 -267
- package/src/python/requirements.txt +0 -12
- package/src/python/row_count.py +0 -54
- package/src/python/splitter_engine.py +0 -283
- package/src/python/target_engine.py +0 -154
- package/src/python/test_framework_adapters.py +0 -61
- package/src/python/test_fusion_engine.py +0 -89
- package/src/python/uci_adapter.py +0 -94
- package/src/python/vesper/__init__.py +0 -1
- package/src/python/vesper/core/__init__.py +0 -1
- package/src/python/vesper/core/asset_downloader.py +0 -679
- package/src/python/vesper/core/download_recipe.py +0 -104
- package/src/python/worldbank_adapter.py +0 -99
- package/wizard.cjs +0 -3
|
@@ -1,82 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import path from "path";
|
|
3
|
-
import fs from "fs";
|
|
4
|
-
/**
 * Runs the Python `splitter_engine.py` script as a child process to split a
 * dataset and to validate existing splits. The script is expected to print a
 * single JSON document on stdout.
 */
export class DataSplitter {
    /** Interpreter command used to launch the script ("py" on Windows). */
    pythonPath = "python";
    /** Path of the `splitter_engine.py` script selected by the constructor. */
    scriptPath;
    /**
     * Locates `splitter_engine.py`, preferring the copy under
     * `<home>/.vesper/python`, then the build directory and its siblings.
     * When no candidate exists, the `.vesper` location is kept so that the
     * failure surfaces when the script is actually run.
     *
     * @param {string} [buildDir=process.cwd()] Directory used for the local
     *   candidates and as the home-directory fallback.
     */
    constructor(buildDir = process.cwd()) {
        const homeDir = process.env.HOME || process.env.USERPROFILE || buildDir;
        const dataRoot = path.join(homeDir, ".vesper");
        // Ordered by priority: the first existing path wins.
        const candidates = [
            path.resolve(dataRoot, "python", "splitter_engine.py"),
            path.resolve(buildDir, "python", "splitter_engine.py"),
            path.resolve(buildDir, "..", "src", "python", "splitter_engine.py"),
            path.resolve(buildDir, "..", "python", "splitter_engine.py"),
        ];
        this.scriptPath = candidates.find((candidate) => fs.existsSync(candidate)) ?? candidates[0];
        // Detect Python command
        if (process.platform === "win32") {
            this.pythonPath = "py";
        }
    }
    /**
     * Splits a dataset into Train/Val/Test sets based on config.
     *
     * @param {string} filePath Path of the dataset file handed to the script.
     * @param {object} config Split configuration; passed to the script as JSON.
     * @returns {Promise<object>} Parsed JSON result printed by the script.
     */
    async split(filePath, config) {
        return this.runPython("split", [filePath, JSON.stringify(config)]);
    }
    /**
     * Validates a split for leakage and distribution.
     *
     * @param {object} paths Split file paths, forwarded unchanged to the script.
     * @param {{id_column?: string, target_column?: string}} [options]
     * @returns {Promise<object>} Parsed JSON result printed by the script.
     */
    async validate(paths, options) {
        const config = {
            paths,
            id_column: options?.id_column,
            target_column: options?.target_column,
        };
        return this.runPython("validate", [JSON.stringify(config)]);
    }
    /**
     * Spawns the script with `action` followed by `args` and parses its stdout
     * as JSON.
     *
     * @param {string} action Sub-command passed as the first script argument.
     * @param {string[]} args Remaining script arguments.
     * @returns {Promise<object>} Resolves with the parsed result.
     * @throws {Error} Rejects when the interpreter cannot be started, exits
     *   with a non-zero code, prints invalid JSON, or reports an `error` field.
     */
    async runPython(action, args) {
        return new Promise((resolve, reject) => {
            // Named `child` so the global `process` is not shadowed.
            const child = spawn(this.pythonPath, [this.scriptPath, action, ...args]);
            let stdout = "";
            let stderr = "";
            child.stdout.on("data", (data) => {
                stdout += data.toString();
            });
            child.stderr.on("data", (data) => {
                stderr += data.toString();
            });
            // Without this listener a missing interpreter raises an unhandled
            // 'error' event instead of rejecting the promise.
            child.on("error", (error) => {
                reject(new Error(`Data Splitter (${action}) failed: ${error.message}`, { cause: error }));
            });
            child.on("close", (code) => {
                if (code !== 0) {
                    reject(new Error(`Data Splitter (${action}) failed: ${stderr}`));
                    return;
                }
                try {
                    const result = JSON.parse(stdout);
                    if (result.error) {
                        reject(new Error(result.error));
                    }
                    else {
                        resolve(result);
                    }
                }
                catch (e) {
                    reject(new Error(`Failed to parse output: ${stdout}`));
                }
            });
        });
    }
}
|
package/build/splitting/types.js
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
export {};
|
package/build/tools/formatter.js
DELETED
|
@@ -1,251 +0,0 @@
|
|
|
1
|
-
/**
 * Format job status for visual representation.
 *
 * Builds a multi-line text block with the job type, id, status label, a
 * 20-character progress bar, the current activity, a polling hint, and the
 * optional result URL and error text.
 *
 * @param {object} job Job record. Reads `type`, `id`, `status`, `progress`,
 *   `status_text`, `result_url`, `error` and `updated_at`.
 * @returns {string} The formatted status block.
 */
export function formatJobStatus(job) {
    const statusMap = {
        "pending": "PENDING",
        "queued": "QUEUED",
        "running": "RUNNING",
        "completed": "COMPLETED",
        "failed": "FAILED",
        "retrying": "RETRYING",
    };
    const activeStatuses = ["running", "retrying", "queued", "pending"];
    const statusText = statusMap[job.status] || "UNKNOWN";
    const barWidth = 20;
    // Clamp to 0-100: String.prototype.repeat throws a RangeError on negative
    // counts, which an out-of-range progress value would otherwise produce.
    const progress = Number.isFinite(job.progress)
        ? Math.min(100, Math.max(0, job.progress))
        : 0;
    const filledWidth = Math.round((progress / 100) * barWidth);
    const emptyWidth = barWidth - filledWidth;
    const bar = "█".repeat(filledWidth) + "░".repeat(emptyWidth);
    let output = `═ Job Status: ${job.type.toUpperCase()} ═\n`;
    output += `ID: ${job.id}\n`;
    output += `Status: ${statusText}\n`;
    output += `Progress: ${bar} ${progress}%\n`;
    output += `Activity: ${job.status_text}\n`;
    if (activeStatuses.includes(job.status)) {
        output += `Polling hint: check again in 5-10 seconds.\n`;
    }
    else {
        output += `Polling hint: no further polling required.\n`;
    }
    if (job.result_url) {
        output += `\nResult: ${job.result_url}\n`;
    }
    if (job.error) {
        output += `\nERROR:\n`;
        // Format multi-line errors nicely; String() tolerates Error objects.
        for (const line of String(job.error).split("\n")) {
            output += ` ${line}\n`;
        }
        output += `\n`;
    }
    output += `Updated: ${new Date(job.updated_at).toLocaleTimeString()}\n`;
    output += "═".repeat(25) + "\n";
    return output;
}
|
|
45
|
-
/**
 * Format dataset search results for human-readable display.
 *
 * Each result is rendered as a numbered entry with source/access/licence
 * badges, relevance, a description truncated to 200 characters, quality
 * warnings, key stats, licence details and the download URL.
 *
 * @param {object[]|null|undefined} results Dataset records. A record without
 *   a `source` is labelled "Unknown"; a record without a `license` object is
 *   treated as having an unknown licence.
 * @returns {string} The formatted listing, or a "no datasets" message when
 *   `results` is empty or missing.
 */
export function formatSearchResults(results) {
    if (!results || results.length === 0) {
        return "No datasets found matching your query.";
    }
    const openSources = ["huggingface", "openml", "s3", "uci", "github", "worldbank", "nasa"];
    const safetyLabels = { safe: "Safe", restricted: "Restricted" };
    let output = `Found ${results.length} dataset(s):\n\n`;
    output += "═".repeat(80) + "\n\n";
    results.forEach((ds, index) => {
        const relevanceScore = ds.relevance_score || 0;
        // Fall back instead of throwing on records that lack these fields.
        const source = typeof ds.source === "string" && ds.source.length > 0 ? ds.source : "unknown";
        const license = ds.license ?? {};
        // Source badge and access level
        const isOpen = openSources.includes(source);
        const sourceLabel = source.charAt(0).toUpperCase() + source.slice(1);
        const accessBadge = isOpen ? "Open Access" : "Requires API Key";
        // Safety indicator
        const safetyIndicator = Object.hasOwn(safetyLabels, String(license.category))
            ? safetyLabels[license.category]
            : "Unknown License";
        // Header
        output += `${index + 1}. ${ds.name}\n`;
        output += ` Source: ${sourceLabel} | ${accessBadge} | ${safetyIndicator}\n`;
        output += ` Relevance: ${(relevanceScore * 100).toFixed(0)}% | ID: ${ds.id}\n\n`;
        // Description
        if (ds.description && ds.description.length > 0) {
            const shortDesc = ds.description.length > 200
                ? ds.description.substring(0, 200) + "..."
                : ds.description;
            output += ` ${shortDesc}\n\n`;
        }
        // Quality warnings
        if (ds.quality_warnings && ds.quality_warnings.length > 0) {
            output += ` Quality Warnings:\n`;
            for (const warning of ds.quality_warnings) {
                output += ` • ${warning}\n`;
            }
            output += "\n";
        }
        // Key stats (zero/absent values are omitted)
        output += ` Stats:\n`;
        if (ds.downloads) {
            output += ` Downloads: ${ds.downloads.toLocaleString()}\n`;
        }
        if (ds.likes) {
            output += ` Likes: ${ds.likes}\n`;
        }
        if (ds.total_examples) {
            output += ` Examples: ${ds.total_examples.toLocaleString()}\n`;
        }
        if (ds.total_size_mb) {
            output += ` Size: ${ds.total_size_mb} MB\n`;
        }
        output += ` Domain: ${ds.domain || "unknown"}\n`;
        output += ` Task: ${ds.task || "unknown"}\n`;
        // Data splits
        if (ds.splits && ds.splits.length > 0) {
            const splitNames = ds.splits.map((s) => s.name).join(", ");
            output += ` Splits: ${splitNames}\n`;
        }
        // License details
        output += `\n License: ${license.id || "Unknown"}\n`;
        if (license.warnings && license.warnings.length > 0) {
            for (const warning of license.warnings) {
                output += ` WARNING: ${warning}\n`;
            }
        }
        if (license.commercial_use !== undefined) {
            output += ` Commercial use: ${license.commercial_use ? "Yes" : "No"}\n`;
        }
        // Download link
        output += `\n ${ds.download_url}\n`;
        output += "\n" + "─".repeat(80) + "\n\n";
    });
    return output;
}
|
|
124
|
-
/**
 * Format detailed dataset info.
 *
 * Renders a full report for one dataset: header, source and safety summary,
 * connector notes, description, quality warnings, metadata, data
 * characteristics, splits, the first 10 columns, licence information, safety
 * flags, the first 15 tags and the download URL.
 *
 * @param {object} ds Dataset record. A missing `source` is labelled
 *   "Unknown"; a missing `license` object is treated as an unknown licence;
 *   a missing quality score or last-updated date is shown as "N/A".
 * @returns {string} The formatted report.
 */
export function formatDatasetInfo(ds) {
    const rule = "═".repeat(80);
    // Fall back instead of throwing on records that lack these fields.
    const source = typeof ds.source === "string" && ds.source.length > 0 ? ds.source : "unknown";
    const license = ds.license ?? {};
    let output = "";
    // Header
    output += rule + "\n";
    output += `${ds.name}\n`;
    output += rule + "\n\n";
    // Source and safety
    const openSources = ["huggingface", "openml", "s3", "uci", "github", "worldbank", "nasa"];
    const isOpen = openSources.includes(source);
    const sourceLabel = source.charAt(0).toUpperCase() + source.slice(1);
    const accessBadge = isOpen ? "Open Access" : "Requires API Key";
    let safetyIndicator = "Unknown license - Use with caution";
    if (license.category === "safe") {
        safetyIndicator = "Safe for use";
    }
    else if (license.category === "restricted") {
        safetyIndicator = "Restricted - Review license carefully";
    }
    output += `Source: ${sourceLabel} (${accessBadge})\n`;
    output += `Safety: ${safetyIndicator}\n`;
    output += `ID: ${ds.id}\n\n`;
    if (!isOpen && source === "kaggle") {
        output += `NOTE: This dataset uses the Kaggle connector. Vesper can access it through server-managed credentials when configured, otherwise a Kaggle key is still required.\n\n`;
    }
    if (!isOpen && source === "dataworld") {
        output += `NOTE: This dataset uses the data.world connector. Vesper can access it through a server-managed token when configured.\n\n`;
    }
    // Description
    if (ds.description) {
        output += "Description:\n";
        output += `${ds.description}\n\n`;
    }
    // Quality warnings
    if (ds.quality_warnings && ds.quality_warnings.length > 0) {
        output += "Quality Warnings:\n";
        for (const warning of ds.quality_warnings) {
            output += ` • ${warning}\n`;
        }
        output += "\n";
    }
    // Metadata
    const qualityScore = ds.quality_score == null ? "N/A" : `${ds.quality_score}/100`;
    const lastUpdatedDate = ds.last_updated == null ? null : new Date(ds.last_updated);
    // An unparseable date would otherwise be printed as "Invalid Date".
    const lastUpdated = lastUpdatedDate && !Number.isNaN(lastUpdatedDate.getTime())
        ? lastUpdatedDate.toLocaleDateString()
        : "N/A";
    output += "Metadata:\n";
    output += ` Downloads: ${ds.downloads?.toLocaleString() || "N/A"}\n`;
    output += ` Likes: ${ds.likes || 0}\n`;
    output += ` Quality Score: ${qualityScore}\n`;
    output += ` Domain: ${ds.domain || "unknown"}\n`;
    output += ` Task: ${ds.task || "unknown"}\n`;
    output += ` Languages: ${ds.languages?.join(", ") || "N/A"}\n`;
    output += ` Last Updated: ${lastUpdated}\n\n`;
    // Data characteristics
    output += "Data Characteristics:\n";
    output += ` Total Examples: ${ds.total_examples?.toLocaleString() || "N/A"}\n`;
    output += ` Total Size: ${ds.total_size_mb ? ds.total_size_mb + " MB" : "N/A"}\n`;
    output += ` Structured: ${ds.is_structured ? "Yes" : "No"}\n`;
    output += ` Has Target Column: ${ds.has_target_column ? "Yes" : "No"}\n`;
    output += ` Format: ${ds.format || "N/A"}\n\n`;
    // Splits
    if (ds.splits && ds.splits.length > 0) {
        output += "Data Splits:\n";
        for (const split of ds.splits) {
            output += ` • ${split.name}: ${split.num_examples?.toLocaleString() || "?"} examples`;
            if (split.size_bytes) {
                output += ` (${(split.size_bytes / (1024 * 1024)).toFixed(2)} MB)`;
            }
            output += "\n";
        }
        output += "\n";
    }
    // Columns (only the first 10 are listed)
    if (ds.columns && ds.columns.length > 0) {
        output += "Columns:\n";
        for (const col of ds.columns.slice(0, 10)) {
            const targetMarker = col.is_target ? " [TARGET]" : "";
            output += ` • ${col.name}${targetMarker}`;
            if (col.type) {
                output += ` (${col.type})`;
            }
            output += "\n";
        }
        if (ds.columns.length > 10) {
            output += ` ... and ${ds.columns.length - 10} more columns\n`;
        }
        output += "\n";
    }
    // License
    let commercialUse = "Unknown";
    if (license.commercial_use != null) {
        // Only claim "Not allowed" when the record actually says so.
        commercialUse = license.commercial_use ? "Allowed" : "Not allowed";
    }
    output += "License Information:\n";
    output += ` License: ${license.id || "Unknown"}\n`;
    output += ` Category: ${license.category ?? "unknown"}\n`;
    output += ` Commercial Use: ${commercialUse}\n`;
    if (license.warnings && license.warnings.length > 0) {
        output += ` Warnings:\n`;
        for (const warning of license.warnings) {
            output += ` WARNING: ${warning}\n`;
        }
    }
    if (license.usage_restrictions && license.usage_restrictions.length > 0) {
        output += ` Restrictions:\n`;
        for (const restriction of license.usage_restrictions) {
            output += ` • ${restriction}\n`;
        }
    }
    output += "\n";
    // Safety flags
    output += "Safety Flags:\n";
    output += ` Safe Source: ${ds.is_safe_source ? "Yes" : "No"}\n`;
    output += ` Has Personal Data: ${ds.has_personal_data ? "Yes" : "No"}\n`;
    output += ` Paywalled: ${ds.is_paywalled ? "Yes" : "No"}\n`;
    output += ` Scraped Web Data: ${ds.is_scraped_web_data ? "Yes" : "No"}\n\n`;
    // Tags (only the first 15 are listed)
    if (ds.tags && ds.tags.length > 0) {
        output += "Tags:\n";
        output += ` ${ds.tags.slice(0, 15).join(", ")}`;
        if (ds.tags.length > 15) {
            output += ` ... and ${ds.tags.length - 15} more`;
        }
        output += "\n\n";
    }
    // Download link
    output += "Download:\n";
    output += ` ${ds.download_url}\n\n`;
    output += rule + "\n";
    return output;
}
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
import fs from "fs";
|
|
2
|
-
import { Readable } from "stream";
|
|
3
|
-
import { finished } from "stream/promises";
|
|
4
|
-
import { retryWithBackoff } from "../metadata/rate-limiter.js";
|
|
5
|
-
/**
 * File downloader built on `fetch`, wrapped in `retryWithBackoff` and able to
 * resume a partially written target file through an HTTP Range request.
 */
export class RobustDownloader {
    /**
     * Downloads a file with automatic retries and resume support.
     *
     * @param {string} url Resource to download.
     * @param {string} targetPath File the response body is written to.
     * @param {object} [options]
     * @param {object} [options.headers] Extra request headers.
     * @param {boolean} [options.resume] When true and `targetPath` already
     *   holds data, request only the remaining bytes and append them.
     * @param {(downloaded: number, total: number) => void} [options.onProgress]
     *   Progress callback, invoked at most about every 500 ms and on the
     *   final byte. `total` is the Content-Length plus the resumed offset.
     * @returns {Promise<void>} Settles once the file stream has finished.
     * @throws {Error} On a non-success HTTP status or an empty response body
     *   (after `retryWithBackoff` gives up).
     */
    async download(url, targetPath, options = {}) {
        await retryWithBackoff(async () => {
            let startByte = 0;
            const headers = { ...(options.headers || {}) };
            // Handle resume logic
            if (options.resume && fs.existsSync(targetPath)) {
                startByte = fs.statSync(targetPath).size;
                if (startByte > 0) {
                    headers["Range"] = `bytes=${startByte}-`;
                    console.error(`[Downloader] Resuming from byte ${startByte}`);
                }
            }
            const response = await fetch(url, { headers });
            if (response.status === 416) {
                // Requested range not satisfiable - likely already finished
                console.error("[Downloader] Range not satisfiable, file might be complete.");
                return;
            }
            // `ok` already covers 206 Partial Content.
            if (!response.ok) {
                throw new Error(`Download failed: ${response.statusText} (${response.status})`);
            }
            if (startByte > 0 && response.status !== 206) {
                // The server ignored the Range header and is sending the whole
                // file; appending it to the partial file would corrupt it.
                console.error("[Downloader] Server ignored Range request, restarting from byte 0");
                startByte = 0;
            }
            const contentLength = response.headers.get("content-length");
            const totalSize = (contentLength ? parseInt(contentLength, 10) : 0) + startByte;
            const reader = response.body;
            if (!reader) {
                throw new Error("Response body is empty");
            }
            // Open stream in append mode if resuming
            const fileStream = fs.createWriteStream(targetPath, { flags: startByte > 0 ? "a" : "w" });
            const nodeReadable = Readable.fromWeb(reader);
            let downloadedBytes = startByte;
            let lastProgressTime = 0;
            nodeReadable.on("data", (chunk) => {
                downloadedBytes += chunk.length;
                // Throttle progress updates
                const now = Date.now();
                if (options.onProgress && (now - lastProgressTime > 500 || downloadedBytes === totalSize)) {
                    options.onProgress(downloadedBytes, totalSize);
                    lastProgressTime = now;
                }
            });
            // pipe() does not forward source errors; destroy the file stream
            // so finished() rejects and the retry wrapper can try again.
            nodeReadable.on("error", (error) => {
                fileStream.destroy(error);
            });
            await finished(nodeReadable.pipe(fileStream));
        }, { maxRetries: 5, initialDelay: 2000 });
    }
}
|
|
@@ -1,130 +0,0 @@
|
|
|
1
|
-
import { spawn } from "child_process";
|
|
2
|
-
import fs from "fs";
|
|
3
|
-
import os from "os";
|
|
4
|
-
import path from "path";
|
|
5
|
-
/**
 * Returns the user's home directory.
 *
 * Tries `os.homedir()`, then the HOME and USERPROFILE environment variables,
 * and finally falls back to `buildDir` when all of those are empty.
 *
 * @param {string} buildDir Fallback directory.
 * @returns {string} The first non-empty candidate.
 */
function getHomeDir(buildDir) {
    const candidates = [os.homedir(), process.env.HOME, process.env.USERPROFILE];
    return candidates.find(Boolean) || buildDir;
}
|
|
8
|
-
/**
 * Returns the Vesper data directory: `.vesper` inside the home directory
 * reported by `getHomeDir`.
 *
 * @param {string} [buildDir=process.cwd()] Passed to `getHomeDir` as its
 *   fallback directory.
 * @returns {string} Path of the `.vesper` directory.
 */
export function getVesperDataRoot(buildDir = process.cwd()) {
    const homeDir = getHomeDir(buildDir);
    return path.join(homeDir, ".vesper");
}
|
|
11
|
-
/**
 * Returns the path of the Python interpreter inside the managed virtual
 * environment (`<data root>/.venv`). The path is computed only; the file is
 * not checked for existence.
 *
 * @param {string} [buildDir=process.cwd()] Passed to `getVesperDataRoot`.
 * @returns {string} `Scripts/python.exe` under the venv on Windows,
 *   `bin/python` elsewhere.
 */
export function getManagedPythonPath(buildDir = process.cwd()) {
    const venvDir = path.join(getVesperDataRoot(buildDir), ".venv");
    const interpreter = process.platform === "win32"
        ? ["Scripts", "python.exe"]
        : ["bin", "python"];
    return path.join(venvDir, ...interpreter);
}
|
|
17
|
-
/**
 * Returns the interpreter command used when no virtual environment or
 * override is available.
 *
 * @returns {string} "py" on Windows, "python3" on every other platform.
 */
function getFallbackPythonCommand() {
    if (process.platform === "win32") {
        return "py";
    }
    return "python3";
}
|
|
20
|
-
/**
 * Chooses the Python interpreter to run, in priority order:
 *  1. the managed virtual-environment interpreter, when it exists;
 *  2. the `VESPER_PYTHON` environment variable, when set;
 *  3. a `.venv` interpreter in `buildDir` or in its parent directory;
 *  4. the platform fallback command.
 *
 * @param {string} [buildDir=process.cwd()] Base directory for the lookups.
 * @returns {string} Interpreter path or command name.
 */
export function resolvePythonCommand(buildDir = process.cwd()) {
    const managedPython = getManagedPythonPath(buildDir);
    if (fs.existsSync(managedPython)) {
        return managedPython;
    }
    const override = process.env.VESPER_PYTHON;
    if (override) {
        return override;
    }
    const interpreter = process.platform === "win32"
        ? ["Scripts", "python.exe"]
        : ["bin", "python"];
    // buildDir itself first, then its parent.
    const venvParents = [[buildDir], [buildDir, ".."]];
    const localPython = venvParents
        .map((segments) => path.resolve(...segments, ".venv", ...interpreter))
        .find((candidate) => fs.existsSync(candidate));
    return localPython ?? getFallbackPythonCommand();
}
|
|
45
|
-
/**
 * Runs an interpreter with the given arguments and collects its output.
 *
 * The child inherits the current environment with `PYTHONIOENCODING` forced
 * to "utf-8". A non-zero exit code is reported through the resolved value,
 * not as a rejection.
 *
 * @param {string} pythonPath Interpreter path or command name.
 * @param {string[]} args Command-line arguments.
 * @param {number} [timeoutMs=300000] Time after which the child is killed.
 * @returns {Promise<{code: number, stdout: string, stderr: string}>} `code`
 *   is the exit code (1 when the child reports none), or 124 on timeout; on
 *   timeout with empty stderr, `stderr` carries a timeout message.
 * @throws {Error} Rejects when the process cannot be spawned.
 */
function runPythonCommand(pythonPath, args, timeoutMs = 300000) {
    return new Promise((resolve, reject) => {
        const proc = spawn(pythonPath, args, {
            env: {
                ...process.env,
                PYTHONIOENCODING: "utf-8",
            },
        });
        // Decode at the stream level so a multi-byte UTF-8 character split
        // across two chunks is reassembled instead of turning into U+FFFD.
        proc.stdout.setEncoding("utf8");
        proc.stderr.setEncoding("utf8");
        let stdout = "";
        let stderr = "";
        const timer = setTimeout(() => {
            proc.kill();
            resolve({ code: 124, stdout, stderr: stderr || `Python command timed out after ${timeoutMs}ms` });
        }, timeoutMs);
        proc.stdout.on("data", (chunk) => {
            stdout += chunk;
        });
        proc.stderr.on("data", (chunk) => {
            stderr += chunk;
        });
        proc.on("close", (code) => {
            clearTimeout(timer);
            // After a timeout the promise is already settled; this is a no-op.
            resolve({ code: code ?? 1, stdout, stderr });
        });
        proc.on("error", (error) => {
            clearTimeout(timer);
            reject(error);
        });
    });
}
|
|
75
|
-
/**
 * Ensures the managed virtual environment exists and returns its interpreter.
 *
 * When the interpreter is already present it is returned immediately.
 * Otherwise the data root is created and `venv` is invoked with each
 * platform-specific launcher in turn; after the first successful creation
 * pip is upgraded inside the new environment (its exit code is not checked).
 *
 * @param {string} buildDir Passed to the path helpers.
 * @returns {Promise<string>} Path of the managed interpreter.
 * @throws {Error} When every bootstrap attempt fails; the message carries
 *   the output or error of the last attempt.
 */
async function createManagedPythonEnv(buildDir) {
    const managedPython = getManagedPythonPath(buildDir);
    if (fs.existsSync(managedPython)) {
        return managedPython;
    }
    const dataRoot = getVesperDataRoot(buildDir);
    const venvDir = path.join(dataRoot, ".venv");
    fs.mkdirSync(dataRoot, { recursive: true });
    const venvArgs = ["-m", "venv", venvDir];
    // [command, args] pairs, tried in order.
    const bootstrapAttempts = process.platform === "win32"
        ? [["py", ["-3", ...venvArgs]], ["python", venvArgs]]
        : [["python3", venvArgs], ["python", venvArgs]];
    let lastError = "";
    for (const [command, args] of bootstrapAttempts) {
        try {
            const { code, stdout, stderr } = await runPythonCommand(command, args, 180000);
            const created = code === 0 && fs.existsSync(managedPython);
            if (!created) {
                lastError = (stderr || stdout || "Unknown venv creation error").trim();
                continue;
            }
            await runPythonCommand(managedPython, ["-m", "pip", "install", "--disable-pip-version-check", "--upgrade", "pip"], 300000);
            return managedPython;
        }
        catch (error) {
            // Spawn failures (e.g. launcher not installed) move on to the next attempt.
            lastError = error?.message || String(error);
        }
    }
    throw new Error(`Failed to create Vesper Python environment. ${lastError}`.trim());
}
|
|
108
|
-
/**
 * Make sure every required Python module is importable, installing the
 * packages that provide the missing ones with pip.
 *
 * The managed environment is preferred; if it cannot be created, the
 * interpreter returned by `resolvePythonCommand(buildDir)` is used instead.
 *
 * @param buildDir - Forwarded to the interpreter lookup helpers.
 * @param requirements - Iterable of `{ module, packageName }` entries:
 *   `module` is probed with `importlib.util.find_spec`, `packageName` is what
 *   gets passed to `pip install` when the probe fails.
 * @returns {Promise<string>} The Python command/path that was used.
 * @throws {Error} When `pip install` exits with a non-zero code.
 */
export async function ensurePythonPackages(buildDir, requirements) {
    let pythonPath;
    try {
        pythonPath = await createManagedPythonEnv(buildDir);
    }
    catch {
        // No managed venv available: fall back to whatever interpreter can be resolved.
        pythonPath = await resolvePythonCommand(buildDir);
    }
    // Insertion-ordered and de-duplicated list of packages whose module probe failed.
    const pendingPackages = new Set();
    for (const requirement of requirements) {
        const probeScript = `import importlib.util,sys; sys.exit(0 if importlib.util.find_spec(${JSON.stringify(requirement.module)}) else 1)`;
        const probe = await runPythonCommand(pythonPath, ["-c", probeScript], 20000);
        if (probe.code !== 0) {
            pendingPackages.add(requirement.packageName);
        }
    }
    if (pendingPackages.size === 0) {
        return pythonPath;
    }
    const packages = Array.from(pendingPackages);
    const pipArgs = ["-m", "pip", "install", "--disable-pip-version-check", ...packages];
    const install = await runPythonCommand(pythonPath, pipArgs, 600000);
    if (install.code === 0) {
        return pythonPath;
    }
    const details = (install.stderr || install.stdout || "Unknown pip install error").trim();
    throw new Error(`Failed to install Python packages (${packages.join(", ")}). ${details}`);
}
|
package/build/utils/selector.js
DELETED
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
import readline from "readline";
|
|
2
|
-
/**
 * Interactive multi-select checkbox menu drawn on the terminal.
 *
 * Each option is an object with `name` (label shown), `value` (returned when
 * the option is chosen) and `selected` (checkbox state, toggled in place).
 */
export class Selector {
    currentIndex = 0;
    options;
    title;
    /**
     * @param title - Heading printed above the option list.
     * @param {Array<{name: *, value: *, selected: boolean}>} options - Menu entries.
     */
    constructor(title, options) {
        this.title = title;
        this.options = options;
    }
    /**
     * Redraw the menu in place: wipe the previously drawn frame, then print
     * the title, one line per option and the key hint.
     *
     * A frame is `options.length + 2` lines tall (title + options + hint), and
     * exactly that many lines are walked upward before printing, so repeated
     * redraws stay on the same rows instead of creeping down the screen.
     */
    render() {
        process.stdout.write("\x1b[?25l"); // Hide cursor
        readline.cursorTo(process.stdout, 0);
        // Clear the line below the frame plus every frame line except the title...
        for (let i = 0; i <= this.options.length + 1; i++) {
            readline.clearLine(process.stdout, 0);
            process.stdout.write("\x1b[1A"); // Move up one line
        }
        // ...then the title line itself, which is where printing starts again.
        readline.clearLine(process.stdout, 0);
        console.log(this.title);
        this.options.forEach((opt, idx) => {
            const isCurrent = idx === this.currentIndex;
            const checkbox = opt.selected ? "[\x1b[32mX\x1b[0m]" : "[ ]";
            const cursor = isCurrent ? "\x1b[36m>\x1b[0m " : " ";
            const label = isCurrent ? `\x1b[36m${opt.name}\x1b[0m` : opt.name;
            console.log(`${cursor}${checkbox} ${label}`);
        });
        console.log("\x1b[2m(Use arrows to move, Space to toggle, Enter to confirm)\x1b[0m");
    }
    /**
     * Show the menu and wait for the user to confirm with Enter.
     *
     * Up/Down move the highlight (wrapping around), Space toggles the
     * highlighted option, Ctrl+C restores the terminal and exits the process.
     *
     * @returns {Promise<Array>} The `value` of every option whose `selected`
     *   flag is set at confirmation time; `[]` immediately when there are no options.
     */
    async run() {
        if (this.options.length === 0) {
            return [];
        }
        const stdin = process.stdin;
        readline.emitKeypressEvents(stdin);
        // setRawMode only exists on TTY streams, so remember whether it may be used.
        const isInteractive = Boolean(stdin.isTTY);
        if (isInteractive) {
            stdin.setRawMode(true);
        }
        // Reserve one blank separator line plus the rows the first frame will occupy.
        console.log("\n".repeat(this.options.length + 2));
        this.render();
        return new Promise((resolve) => {
            // Undo everything run()/render() changed on the terminal.
            const restoreTerminal = () => {
                if (isInteractive) {
                    stdin.setRawMode(false);
                }
                process.stdout.write("\x1b[?25h"); // Show cursor
            };
            const handleKey = (_str, key) => {
                if (!key) {
                    return;
                }
                const count = this.options.length;
                if (key.name === "up") {
                    this.currentIndex = (this.currentIndex - 1 + count) % count;
                    this.render();
                }
                else if (key.name === "down") {
                    this.currentIndex = (this.currentIndex + 1) % count;
                    this.render();
                }
                else if (key.name === "space") {
                    const current = this.options[this.currentIndex];
                    current.selected = !current.selected;
                    this.render();
                }
                else if (key.name === "return") {
                    restoreTerminal();
                    stdin.removeListener("keypress", handleKey);
                    console.log("");
                    resolve(this.options.filter(o => o.selected).map(o => o.value));
                }
                else if (key.ctrl && key.name === "c") {
                    // Raw mode swallows SIGINT, so exit by hand, leaving the cursor visible.
                    restoreTerminal();
                    process.exit();
                }
            };
            stdin.on("keypress", handleKey);
        });
    }
}
|
package/mcp-config-template.json
DELETED
|
@@ -1,18 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"mcpServers": {
|
|
3
|
-
"vesper": {
|
|
4
|
-
"command": "npx",
|
|
5
|
-
"args": [
|
|
6
|
-
"-y",
|
|
7
|
-
"-p",
|
|
8
|
-
"@vespermcp/mcp-server@latest",
|
|
9
|
-
"vespermcp"
|
|
10
|
-
],
|
|
11
|
-
"env": {
|
|
12
|
-
"KAGGLE_USERNAME": "your-kaggle-username",
|
|
13
|
-
"KAGGLE_KEY": "your-kaggle-api-key",
|
|
14
|
-
"HF_TOKEN": "your-huggingface-token"
|
|
15
|
-
}
|
|
16
|
-
}
|
|
17
|
-
}
|
|
18
|
-
}
|