vesper-wizard 2.0.5 → 2.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +300 -37
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +81 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +62 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +127 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +26 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/config/config-manager.js +221 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +69 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/engine.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/gateway/unified-dataset-gateway.js +409 -0
- package/build/index.js +2704 -0
- package/build/ingestion/hf-downloader.js +171 -0
- package/build/ingestion/ingestor.js +271 -0
- package/build/ingestion/kaggle-downloader.js +102 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +136 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/lib/supabase.js +3 -0
- package/build/metadata/dataworld-source.js +89 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/openml-source.js +87 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +377 -0
- package/build/metadata/store.js +340 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/preparation/target-detector.js +75 -0
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +92 -0
- package/build/python/cleaner.py +226 -0
- package/build/python/config.py +263 -0
- package/build/python/dataworld_engine.py +208 -0
- package/build/python/export_engine.py +243 -0
- package/build/python/framework_adapters.py +100 -0
- package/build/python/fusion_engine.py +368 -0
- package/build/python/github_adapter.py +106 -0
- package/build/python/hf_fallback.py +298 -0
- package/build/python/image_engine.py +86 -0
- package/build/python/kaggle_engine.py +295 -0
- package/build/python/media_engine.py +133 -0
- package/build/python/nasa_adapter.py +82 -0
- package/build/python/openml_engine.py +146 -0
- package/build/python/quality_engine.py +267 -0
- package/build/python/row_count.py +54 -0
- package/build/python/splitter_engine.py +283 -0
- package/build/python/target_engine.py +154 -0
- package/build/python/test_framework_adapters.py +61 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/python/uci_adapter.py +94 -0
- package/build/python/vesper/__init__.py +1 -0
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +1 -0
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +675 -0
- package/build/python/vesper/core/download_recipe.py +104 -0
- package/build/python/worldbank_adapter.py +99 -0
- package/build/quality/analyzer.js +93 -0
- package/build/quality/image-analyzer.js +114 -0
- package/build/quality/media-analyzer.js +115 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/cleanup-kaggle.js +41 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/repro-bug.js +37 -0
- package/build/scripts/repro-export-bug.js +56 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +74 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-production-sync.js +36 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-target-detector.js +29 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/scripts/test-write.js +14 -0
- package/build/scripts/verify-integration.js +57 -0
- package/build/scripts/verify-priority.js +33 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +152 -0
- package/build/search/jit-orchestrator.js +258 -0
- package/build/search/vector-store.js +123 -0
- package/build/splitting/splitter.js +82 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +251 -0
- package/build/utils/downloader.js +52 -0
- package/build/utils/selector.js +69 -0
- package/mcp-config-template.json +18 -0
- package/package.json +101 -29
- package/scripts/postinstall.cjs +114 -0
- package/scripts/preindex_registry.cjs +157 -0
- package/scripts/refresh-index.cjs +87 -0
- package/{wizard.js → scripts/wizard.js} +99 -21
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +92 -0
- package/src/python/cleaner.py +226 -0
- package/src/python/config.py +263 -0
- package/src/python/dataworld_engine.py +208 -0
- package/src/python/export_engine.py +243 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/fusion_engine.py +368 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/hf_fallback.py +298 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/kaggle_engine.py +295 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/openml_engine.py +146 -0
- package/src/python/quality_engine.py +267 -0
- package/src/python/row_count.py +54 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/target_engine.py +154 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/test_fusion_engine.py +89 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/vesper/__init__.py +1 -0
- package/src/python/vesper/core/__init__.py +1 -0
- package/src/python/vesper/core/asset_downloader.py +675 -0
- package/src/python/vesper/core/download_recipe.py +104 -0
- package/src/python/worldbank_adapter.py +99 -0
- package/vesper-mcp-config.json +0 -6
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
/**
 * Format job status for visual representation.
 *
 * Builds a multi-line, plain-text summary of a job: a header with the job
 * type, the id, a status label, a 20-cell progress bar, the current activity,
 * a polling hint, and (when present) the result URL and the error text.
 *
 * @param {object} job - Job record. Reads `type`, `id`, `status`, `progress`,
 *   `status_text`, `result_url`, `error` and `updated_at`.
 * @returns {string} The formatted status block, terminated by a newline.
 */
export function formatJobStatus(job) {
    const statusMap = {
        "pending": "PENDING",
        "queued": "QUEUED",
        "running": "RUNNING",
        "completed": "COMPLETED",
        "failed": "FAILED",
        "retrying": "RETRYING"
    };
    // Own-property lookup so inherited keys such as "constructor" fall back to UNKNOWN.
    const statusText = Object.hasOwn(statusMap, job.status) ? statusMap[job.status] : "UNKNOWN";
    const barWidth = 20;
    // Clamp to 0..100: String.prototype.repeat throws a RangeError on a negative
    // count, which is what an out-of-range progress value would produce.
    const rawProgress = Number(job.progress);
    const progress = Number.isFinite(rawProgress) ? Math.min(100, Math.max(0, rawProgress)) : 0;
    const filledWidth = Math.round((progress / 100) * barWidth);
    const emptyWidth = barWidth - filledWidth;
    const bar = "█".repeat(filledWidth) + "░".repeat(emptyWidth);
    const jobType = String(job.type ?? "unknown").toUpperCase();
    let output = `═ Job Status: ${jobType} ═\n`;
    output += `ID: ${job.id}\n`;
    output += `Status: ${statusText}\n`;
    output += `Progress: ${bar} ${progress}%\n`;
    output += `Activity: ${job.status_text ?? ""}\n`;
    const stillActive = ["running", "retrying", "queued", "pending"].includes(job.status);
    if (stillActive) {
        output += `Polling hint: check again in 5-10 seconds.\n`;
    }
    else {
        output += `Polling hint: no further polling required.\n`;
    }
    if (job.result_url) {
        output += `\nResult: ${job.result_url}\n`;
    }
    if (job.error) {
        output += `\nERROR:\n`;
        // Accept Error objects (or anything else) as well as plain strings.
        const errorText = typeof job.error === "string"
            ? job.error
            : String(job.error.message ?? job.error);
        // Format multi-line errors nicely
        for (const line of errorText.split("\n")) {
            output += ` ${line}\n`;
        }
        output += `\n`;
    }
    const updatedAt = new Date(job.updated_at);
    const updatedText = Number.isNaN(updatedAt.getTime()) ? "unknown" : updatedAt.toLocaleTimeString();
    output += `Updated: ${updatedText}\n`;
    output += "═".repeat(25) + "\n";
    return output;
}
|
|
45
|
+
/**
 * Format dataset search results for human-readable display.
 *
 * Emits one section per dataset: name, source/access/licence badges,
 * relevance, a description truncated to 200 characters, quality warnings,
 * key stats, split names, licence details and the download link.
 *
 * @param {Array<object>} results - Dataset records. Each may carry `name`,
 *   `id`, `source`, `license`, `relevance_score`, `description`,
 *   `quality_warnings`, `downloads`, `likes`, `total_examples`,
 *   `total_size_mb`, `domain`, `task`, `splits` and `download_url`.
 * @returns {string} The formatted listing, or a "no datasets" message when
 *   `results` is empty or not an array.
 */
export function formatSearchResults(results) {
    if (!Array.isArray(results) || results.length === 0) {
        return "No datasets found matching your query.";
    }
    // Loop-invariant values, built once rather than per record.
    const openSources = ["huggingface", "openml", "s3", "uci", "github", "worldbank", "nasa"];
    const rule = "─".repeat(80);
    let output = `Found ${results.length} dataset(s):\n\n`;
    output += rule + "\n\n";
    results.forEach((ds, index) => {
        const relevanceScore = ds.relevance_score || 0;
        // Records may arrive without a source or licence; treat both as unknown
        // instead of throwing on property access.
        const source = typeof ds.source === "string" ? ds.source : "";
        const license = ds.license ?? {};
        // Source badge and access level
        const isOpen = openSources.includes(source);
        const sourceLabel = source ? source.charAt(0).toUpperCase() + source.slice(1) : "Unknown";
        const accessBadge = isOpen ? "Open Access" : "Requires API Key";
        // Safety indicator
        let safetyIndicator = "";
        if (license.category === "safe") {
            safetyIndicator = "Safe";
        }
        else if (license.category === "restricted") {
            safetyIndicator = "Restricted";
        }
        else {
            safetyIndicator = "Unknown License";
        }
        // Header
        output += `${index + 1}. ${ds.name}\n`;
        output += ` Source: ${sourceLabel} | ${accessBadge} | ${safetyIndicator}\n`;
        output += ` Relevance: ${(relevanceScore * 100).toFixed(0)}% | ID: ${ds.id}\n\n`;
        // Description
        if (ds.description && ds.description.length > 0) {
            const shortDesc = ds.description.length > 200
                ? ds.description.substring(0, 200) + "..."
                : ds.description;
            output += ` ${shortDesc}\n\n`;
        }
        // Quality warnings
        if (ds.quality_warnings && ds.quality_warnings.length > 0) {
            output += ` Quality Warnings:\n`;
            ds.quality_warnings.forEach(warning => {
                output += ` • ${warning}\n`;
            });
            output += "\n";
        }
        // Key stats (zero or missing values are omitted)
        output += ` Stats:\n`;
        if (ds.downloads)
            output += ` Downloads: ${ds.downloads.toLocaleString()}\n`;
        if (ds.likes)
            output += ` Likes: ${ds.likes}\n`;
        if (ds.total_examples)
            output += ` Examples: ${ds.total_examples.toLocaleString()}\n`;
        if (ds.total_size_mb)
            output += ` Size: ${ds.total_size_mb} MB\n`;
        output += ` Domain: ${ds.domain || "unknown"}\n`;
        output += ` Task: ${ds.task || "unknown"}\n`;
        // Data splits
        if (ds.splits && ds.splits.length > 0) {
            const splitNames = ds.splits.map(s => s.name).join(", ");
            output += ` Splits: ${splitNames}\n`;
        }
        // License details
        output += `\n License: ${license.id || "Unknown"}\n`;
        if (license.warnings && license.warnings.length > 0) {
            license.warnings.forEach(warning => {
                output += ` WARNING: ${warning}\n`;
            });
        }
        if (license.commercial_use !== undefined) {
            output += ` Commercial use: ${license.commercial_use ? "Yes" : "No"}\n`;
        }
        // Download link (skipped when the record has none, rather than printing "undefined")
        if (ds.download_url) {
            output += `\n ${ds.download_url}\n`;
        }
        output += "\n" + rule + "\n\n";
    });
    return output;
}
|
|
124
|
+
/**
 * Format detailed dataset info.
 *
 * Produces a full plain-text report for one dataset: header, source and
 * safety summary, connector notes, description, quality warnings, metadata,
 * data characteristics, splits, the first 10 columns, licence information,
 * safety flags, the first 15 tags and the download link.
 *
 * Missing optional fields are rendered as "N/A"/"Unknown" instead of throwing
 * or printing "undefined"/"Invalid Date".
 *
 * @param {object} ds - Dataset record (fields as read below; all but `name`
 *   and `id` are treated as optional).
 * @returns {string} The formatted report, terminated by a newline.
 */
export function formatDatasetInfo(ds) {
    // Tolerate records without a licence or source rather than throwing.
    const license = ds.license ?? {};
    const source = typeof ds.source === "string" ? ds.source : "";
    const rule = "═".repeat(80);
    let output = "";
    // Header
    output += rule + "\n";
    output += `${ds.name}\n`;
    output += rule + "\n\n";
    // Source and safety
    const openSources = ["huggingface", "openml", "s3", "uci", "github", "worldbank", "nasa"];
    const isOpen = openSources.includes(source);
    const sourceLabel = source ? source.charAt(0).toUpperCase() + source.slice(1) : "Unknown";
    const accessBadge = isOpen ? "Open Access" : "Requires API Key";
    let safetyIndicator = "";
    if (license.category === "safe") {
        safetyIndicator = "Safe for use";
    }
    else if (license.category === "restricted") {
        safetyIndicator = "Restricted - Review license carefully";
    }
    else {
        safetyIndicator = "Unknown license - Use with caution";
    }
    output += `Source: ${sourceLabel} (${accessBadge})\n`;
    output += `Safety: ${safetyIndicator}\n`;
    output += `ID: ${ds.id}\n\n`;
    if (!isOpen && source === "kaggle") {
        output += `NOTE: This dataset uses the Kaggle connector. Vesper can access it through server-managed credentials when configured, otherwise a Kaggle key is still required.\n\n`;
    }
    if (!isOpen && source === "dataworld") {
        output += `NOTE: This dataset uses the data.world connector. Vesper can access it through a server-managed token when configured.\n\n`;
    }
    // Description
    if (ds.description) {
        output += "Description:\n";
        output += `${ds.description}\n\n`;
    }
    // Quality warnings
    if (ds.quality_warnings && ds.quality_warnings.length > 0) {
        output += "Quality Warnings:\n";
        ds.quality_warnings.forEach(warning => {
            output += ` • ${warning}\n`;
        });
        output += "\n";
    }
    // Metadata
    const lastUpdated = ds.last_updated == null ? null : new Date(ds.last_updated);
    const lastUpdatedText = lastUpdated && !Number.isNaN(lastUpdated.getTime())
        ? lastUpdated.toLocaleDateString()
        : "N/A";
    const qualityText = ds.quality_score == null ? "N/A" : `${ds.quality_score}/100`;
    output += "Metadata:\n";
    output += ` Downloads: ${ds.downloads?.toLocaleString() || "N/A"}\n`;
    output += ` Likes: ${ds.likes || 0}\n`;
    output += ` Quality Score: ${qualityText}\n`;
    output += ` Domain: ${ds.domain || "unknown"}\n`;
    output += ` Task: ${ds.task || "unknown"}\n`;
    output += ` Languages: ${ds.languages?.join(", ") || "N/A"}\n`;
    output += ` Last Updated: ${lastUpdatedText}\n\n`;
    // Data characteristics
    output += "Data Characteristics:\n";
    output += ` Total Examples: ${ds.total_examples?.toLocaleString() || "N/A"}\n`;
    output += ` Total Size: ${ds.total_size_mb ? ds.total_size_mb + " MB" : "N/A"}\n`;
    output += ` Structured: ${ds.is_structured ? "Yes" : "No"}\n`;
    output += ` Has Target Column: ${ds.has_target_column ? "Yes" : "No"}\n`;
    output += ` Format: ${ds.format || "N/A"}\n\n`;
    // Splits
    if (ds.splits && ds.splits.length > 0) {
        output += "Data Splits:\n";
        ds.splits.forEach(split => {
            output += ` • ${split.name}: ${split.num_examples?.toLocaleString() || "?"} examples`;
            if (split.size_bytes) {
                output += ` (${(split.size_bytes / (1024 * 1024)).toFixed(2)} MB)`;
            }
            output += "\n";
        });
        output += "\n";
    }
    // Columns (only the first 10 are listed)
    if (ds.columns && ds.columns.length > 0) {
        output += "Columns:\n";
        ds.columns.slice(0, 10).forEach(col => {
            const targetMarker = col.is_target ? " [TARGET]" : "";
            output += ` • ${col.name}${targetMarker}`;
            if (col.type)
                output += ` (${col.type})`;
            output += "\n";
        });
        if (ds.columns.length > 10) {
            output += ` ... and ${ds.columns.length - 10} more columns\n`;
        }
        output += "\n";
    }
    // License
    // An unset commercial_use is reported as unknown rather than "Not allowed".
    let commercialText = "Unknown";
    if (license.commercial_use != null) {
        commercialText = license.commercial_use ? "Allowed" : "Not allowed";
    }
    output += "License Information:\n";
    output += ` License: ${license.id || "Unknown"}\n`;
    output += ` Category: ${license.category || "unknown"}\n`;
    output += ` Commercial Use: ${commercialText}\n`;
    if (license.warnings && license.warnings.length > 0) {
        output += ` Warnings:\n`;
        license.warnings.forEach(warning => {
            output += ` WARNING: ${warning}\n`;
        });
    }
    if (license.usage_restrictions && license.usage_restrictions.length > 0) {
        output += ` Restrictions:\n`;
        license.usage_restrictions.forEach(restriction => {
            output += ` • ${restriction}\n`;
        });
    }
    output += "\n";
    // Safety flags
    output += "Safety Flags:\n";
    output += ` Safe Source: ${ds.is_safe_source ? "Yes" : "No"}\n`;
    output += ` Has Personal Data: ${ds.has_personal_data ? "Yes" : "No"}\n`;
    output += ` Paywalled: ${ds.is_paywalled ? "Yes" : "No"}\n`;
    output += ` Scraped Web Data: ${ds.is_scraped_web_data ? "Yes" : "No"}\n\n`;
    // Tags (only the first 15 are listed)
    if (ds.tags && ds.tags.length > 0) {
        output += "Tags:\n";
        output += ` ${ds.tags.slice(0, 15).join(", ")}`;
        if (ds.tags.length > 15) {
            output += ` ... and ${ds.tags.length - 15} more`;
        }
        output += "\n\n";
    }
    // Download link
    output += "Download:\n";
    output += ` ${ds.download_url || "N/A"}\n\n`;
    output += rule + "\n";
    return output;
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import { Readable } from "stream";
|
|
3
|
+
import { finished } from "stream/promises";
|
|
4
|
+
import { retryWithBackoff } from "../metadata/rate-limiter.js";
|
|
5
|
+
export class RobustDownloader {
    /**
     * Downloads a file with automatic retries and resume support.
     *
     * Each attempt runs inside `retryWithBackoff` (5 retries, 2000 ms initial
     * delay). When `options.resume` is set and `targetPath` already holds
     * data, a `Range` request is sent and the response is appended; otherwise
     * the file is (re)written from the start.
     *
     * @param {string} url - Resource to fetch.
     * @param {string} targetPath - File to write.
     * @param {object} [options]
     * @param {object} [options.headers] - Extra request headers.
     * @param {boolean} [options.resume] - Continue a partial file if present.
     * @param {(downloaded: number, total: number) => void} [options.onProgress]
     *   Throttled progress callback (at most every 500 ms, plus on completion
     *   when the total size is known).
     * @returns {Promise<void>} Resolves once the file has been fully written.
     * @throws {Error} On a non-success HTTP status, an empty body, or a stream
     *   failure (after the retry wrapper gives up).
     */
    async download(url, targetPath, options = {}) {
        await retryWithBackoff(async () => {
            let startByte = 0;
            const headers = { ...(options.headers || {}) };
            // Handle resume logic
            if (options.resume && fs.existsSync(targetPath)) {
                startByte = fs.statSync(targetPath).size;
                if (startByte > 0) {
                    headers["Range"] = `bytes=${startByte}-`;
                    console.error(`[Downloader] Resuming from byte ${startByte}`);
                }
            }
            const response = await fetch(url, { headers });
            if (response.status === 416) {
                // Requested range not satisfiable - likely already finished
                console.error("[Downloader] Range not satisfiable, file might be complete.");
                return;
            }
            if (!response.ok) {
                throw new Error(`Download failed: ${response.statusText} (${response.status})`);
            }
            if (startByte > 0 && response.status !== 206) {
                // The server ignored the Range header and is sending the whole
                // resource. Appending it to the partial file would corrupt the
                // result, so start over and overwrite instead.
                console.error("[Downloader] Server did not honor Range request, restarting from byte 0");
                startByte = 0;
            }
            const contentLength = response.headers.get("content-length");
            const totalSize = (contentLength ? parseInt(contentLength, 10) : 0) + startByte;
            const reader = response.body;
            if (!reader)
                throw new Error("Response body is empty");
            // Open stream in append mode if resuming
            const fileStream = fs.createWriteStream(targetPath, { flags: startByte > 0 ? "a" : "w" });
            const nodeReadable = Readable.fromWeb(reader);
            let downloadedBytes = startByte;
            let lastProgressTime = 0;
            nodeReadable.on("data", (chunk) => {
                downloadedBytes += chunk.length;
                // Throttle progress updates
                const now = Date.now();
                if (options.onProgress && (now - lastProgressTime > 500 || downloadedBytes === totalSize)) {
                    options.onProgress(downloadedBytes, totalSize);
                    lastProgressTime = now;
                }
            });
            // pipe() does not forward source errors to the destination, so a
            // failed body stream would otherwise never settle `finished` and
            // would leave the file handle open. Destroying the writer with the
            // same error rejects this attempt so the retry wrapper can act.
            nodeReadable.on("error", (err) => {
                fileStream.destroy(err);
            });
            await finished(nodeReadable.pipe(fileStream));
        }, { maxRetries: 5, initialDelay: 2000 });
    }
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import readline from "readline";
|
|
2
|
+
/**
 * Minimal interactive multi-select prompt for the terminal.
 *
 * Each option is an object with `name` (label), `value` (what is returned)
 * and `selected` (current checkbox state, toggled in place).
 */
export class Selector {
    currentIndex = 0;
    options;
    title;
    /**
     * @param {string} title - Heading printed above the option list.
     * @param {Array<{name: string, value: *, selected: boolean}>} options
     */
    constructor(title, options) {
        this.title = title;
        this.options = options;
    }
    /**
     * Redraw the prompt in place: move up over the previously printed lines,
     * clear them, then print the title, one row per option and the key hint.
     */
    render() {
        // Clear previous lines
        process.stdout.write("\x1b[?25l"); // Hide cursor
        readline.cursorTo(process.stdout, 0);
        // Clear the lines we used before (options + title + blank line)
        for (let i = 0; i <= this.options.length + 1; i++) {
            readline.clearLine(process.stdout, 0);
            process.stdout.write("\x1b[1A"); // Move up one line
        }
        readline.clearLine(process.stdout, 0);
        console.log(`\n${this.title}`);
        this.options.forEach((opt, idx) => {
            const isCurrent = idx === this.currentIndex;
            const checkbox = opt.selected ? "[\x1b[32mX\x1b[0m]" : "[ ]";
            const cursor = isCurrent ? "\x1b[36m>\x1b[0m " : " ";
            const label = isCurrent ? `\x1b[36m${opt.name}\x1b[0m` : opt.name;
            console.log(`${cursor}${checkbox} ${label}`);
        });
        console.log("\x1b[2m(Use arrows to move, Space to toggle, Enter to confirm)\x1b[0m");
    }
    /**
     * Show the prompt and wait for the user to confirm with Enter.
     *
     * Up/Down move the highlight (wrapping), Space toggles the highlighted
     * option, Enter resolves, Ctrl+C exits the process.
     *
     * @returns {Promise<Array>} The `value` of every selected option, in list
     *   order; an empty array when there are no options.
     */
    async run() {
        if (this.options.length === 0)
            return [];
        readline.emitKeypressEvents(process.stdin);
        // setRawMode only exists when stdin is a TTY; remember the answer so
        // the teardown paths below use the same guard as the setup.
        const rawCapable = Boolean(process.stdin.isTTY) && typeof process.stdin.setRawMode === "function";
        if (rawCapable) {
            process.stdin.setRawMode(true);
        }
        // Initial render room (print blank lines to be cleared)
        console.log("\n".repeat(this.options.length + 1));
        this.render();
        return new Promise((resolve) => {
            const restoreInputMode = () => {
                if (rawCapable) {
                    process.stdin.setRawMode(false);
                }
            };
            const handleKey = (str, key) => {
                // Ignore events that arrive without key metadata.
                if (!key)
                    return;
                if (key.name === "up") {
                    this.currentIndex = (this.currentIndex - 1 + this.options.length) % this.options.length;
                    this.render();
                }
                else if (key.name === "down") {
                    this.currentIndex = (this.currentIndex + 1) % this.options.length;
                    this.render();
                }
                else if (key.name === "space") {
                    this.options[this.currentIndex].selected = !this.options[this.currentIndex].selected;
                    this.render();
                }
                else if (key.name === "return") {
                    restoreInputMode();
                    process.stdin.removeListener("keypress", handleKey);
                    process.stdout.write("\x1b[?25h"); // Show cursor
                    console.log("");
                    resolve(this.options.filter(o => o.selected).map(o => o.value));
                }
                else if (key.ctrl && key.name === "c") {
                    restoreInputMode();
                    // render() hid the cursor; give it back before exiting.
                    process.stdout.write("\x1b[?25h");
                    process.exit();
                }
            };
            process.stdin.on("keypress", handleKey);
        });
    }
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
{
|
|
2
|
+
"mcpServers": {
|
|
3
|
+
"vesper": {
|
|
4
|
+
"command": "npx",
|
|
5
|
+
"args": [
|
|
6
|
+
"-y",
|
|
7
|
+
"-p",
|
|
8
|
+
"@vespermcp/mcp-server@latest",
|
|
9
|
+
"vespermcp"
|
|
10
|
+
],
|
|
11
|
+
"env": {
|
|
12
|
+
"KAGGLE_USERNAME": "your-kaggle-username",
|
|
13
|
+
"KAGGLE_KEY": "your-kaggle-api-key",
|
|
14
|
+
"HF_TOKEN": "your-huggingface-token"
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
}
|
package/package.json
CHANGED
|
@@ -1,29 +1,101 @@
|
|
|
1
|
-
{
|
|
2
|
-
"name": "vesper-wizard",
|
|
3
|
-
"version": "2.0.
|
|
4
|
-
"description": "
|
|
5
|
-
"
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
"
|
|
10
|
-
"
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
"
|
|
14
|
-
"
|
|
15
|
-
"
|
|
16
|
-
"
|
|
17
|
-
"
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
"
|
|
21
|
-
|
|
22
|
-
"
|
|
23
|
-
"
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
"
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
1
|
+
{
|
|
2
|
+
"name": "vesper-wizard",
|
|
3
|
+
"version": "2.0.6",
|
|
4
|
+
"description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "build/index.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"mcp-server": "./build/index.js",
|
|
9
|
+
"vespermcp": "./build/index.js",
|
|
10
|
+
"vesper-wizard": "scripts/wizard.js"
|
|
11
|
+
},
|
|
12
|
+
"files": [
|
|
13
|
+
"build/**/*",
|
|
14
|
+
"src/python/**/*",
|
|
15
|
+
"scripts/**/*",
|
|
16
|
+
"README.md",
|
|
17
|
+
"LICENSE",
|
|
18
|
+
"mcp-config-template.json"
|
|
19
|
+
],
|
|
20
|
+
"scripts": {
|
|
21
|
+
"build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';const walk=(d)=>fs.readdirSync(d,{withFileTypes:true}).flatMap(e=>e.isDirectory()?walk(path.join(d,e.name)):[path.join(d,e.name)]);if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});if(fs.existsSync(src)){for(const f of walk(src)){if(!f.endsWith('.py'))continue;const rel=path.relative(src,f);const out=path.join(dest,rel);fs.mkdirSync(path.dirname(out),{recursive:true});fs.copyFileSync(f,out);}}console.log('Copied Python scripts to build/python');\"",
|
|
22
|
+
"dev": "tsx watch src/index.ts",
|
|
23
|
+
"postinstall": "node scripts/postinstall.cjs",
|
|
24
|
+
"scrape": "tsx src/scripts/scrape-metadata.ts",
|
|
25
|
+
"massive-scrape": "tsx src/scripts/massive-scrape.ts",
|
|
26
|
+
"index": "tsx src/scripts/build-index.ts",
|
|
27
|
+
"search-cli": "tsx src/scripts/search-cli.ts",
|
|
28
|
+
"check-db": "tsx src/scripts/check-db.ts",
|
|
29
|
+
"test-jit": "tsx src/scripts/test-jit.ts",
|
|
30
|
+
"demo-ui": "tsx src/scripts/demo-ui.ts",
|
|
31
|
+
"fuse": "node build/index.js fuse",
|
|
32
|
+
"discover": "node build/index.js discover",
|
|
33
|
+
"download": "node build/index.js download",
|
|
34
|
+
"export": "node build/index.js export",
|
|
35
|
+
"config": "node build/index.js config",
|
|
36
|
+
"test-fusion-engine": "py src/python/test_fusion_engine.py",
|
|
37
|
+
"setup": "node build/index.js --setup",
|
|
38
|
+
"setup:silent": "node build/index.js --setup --silent",
|
|
39
|
+
"refresh-index": "node scripts/refresh-index.cjs",
|
|
40
|
+
"test": "vitest",
|
|
41
|
+
"start": "node build/index.js"
|
|
42
|
+
},
|
|
43
|
+
"keywords": [
|
|
44
|
+
"mcp",
|
|
45
|
+
"model-context-protocol",
|
|
46
|
+
"dataset",
|
|
47
|
+
"machine-learning",
|
|
48
|
+
"data-quality",
|
|
49
|
+
"huggingface",
|
|
50
|
+
"kaggle",
|
|
51
|
+
"multimodal",
|
|
52
|
+
"image-analysis",
|
|
53
|
+
"audio-analysis",
|
|
54
|
+
"video-analysis",
|
|
55
|
+
"data-preparation",
|
|
56
|
+
"ai",
|
|
57
|
+
"ml"
|
|
58
|
+
],
|
|
59
|
+
"author": "Vesper Team",
|
|
60
|
+
"license": "MIT",
|
|
61
|
+
"repository": {
|
|
62
|
+
"type": "git",
|
|
63
|
+
"url": "https://github.com/vesper/mcp-server"
|
|
64
|
+
},
|
|
65
|
+
"engines": {
|
|
66
|
+
"node": ">=18.0.0",
|
|
67
|
+
"npm": ">=8.0.0"
|
|
68
|
+
},
|
|
69
|
+
"dependencies": {
|
|
70
|
+
"@huggingface/hub": "^2.7.1",
|
|
71
|
+
"@modelcontextprotocol/sdk": "^1.25.2",
|
|
72
|
+
"@polar-sh/nextjs": "^0.9.4",
|
|
73
|
+
"@supabase/supabase-js": "^2.98.0",
|
|
74
|
+
"@xenova/transformers": "^2.17.2",
|
|
75
|
+
"adm-zip": "^0.5.16",
|
|
76
|
+
"ajv": "^8.17.1",
|
|
77
|
+
"ajv-formats": "^3.0.1",
|
|
78
|
+
"better-sqlite3": "^12.6.0",
|
|
79
|
+
"inquirer": "^13.3.0",
|
|
80
|
+
"lodash": "^4.17.21",
|
|
81
|
+
"uuid": "^13.0.0",
|
|
82
|
+
"zod": "^4.3.5",
|
|
83
|
+
"zod-to-json-schema": "^3.25.1"
|
|
84
|
+
},
|
|
85
|
+
"devDependencies": {
|
|
86
|
+
"@types/adm-zip": "^0.5.7",
|
|
87
|
+
"@types/better-sqlite3": "^7.6.13",
|
|
88
|
+
"@types/lodash": "^4.17.23",
|
|
89
|
+
"@types/node": "^25.0.9",
|
|
90
|
+
"@types/uuid": "^10.0.0",
|
|
91
|
+
"@typescript-eslint/eslint-plugin": "^8.53.0",
|
|
92
|
+
"@typescript-eslint/parser": "^8.53.0",
|
|
93
|
+
"eslint": "^9.39.2",
|
|
94
|
+
"eslint-config-prettier": "^10.1.8",
|
|
95
|
+
"prettier": "^3.8.0",
|
|
96
|
+
"tsx": "^4.21.0",
|
|
97
|
+
"typescript": "^5.9.3",
|
|
98
|
+
"vitest": "^4.0.17"
|
|
99
|
+
},
|
|
100
|
+
"packageManager": "pnpm@10.18.1+sha512.77a884a165cbba2d8d1c19e3b4880eee6d2fcabd0d879121e282196b80042351d5eb3ca0935fa599da1dc51265cc68816ad2bddd2a2de5ea9fdf92adbec7cd34"
|
|
101
|
+
}
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
const { execSync } = require('child_process');
|
|
4
|
+
const fs = require('fs');
|
|
5
|
+
const path = require('path');
|
|
6
|
+
|
|
7
|
+
// Announce the start of the postinstall run. The leading emoji was mis-encoded
// in this copy; its original bytes are ambiguous, so the rocket is a best-effort
// restoration.
console.log('\n🚀 Setting up Vesper MCP Server...\n');
|
|
8
|
+
|
|
9
|
+
// 1. Check for Python

/**
 * Finds a Python 3 interpreter on PATH.
 *
 * Probes `python` first and then `python3`, because many macOS and Linux
 * installations only ship the latter. A candidate is accepted only when its
 * `--version` output on stdout reports Python 3; Python 2 prints its version
 * to stderr, so it produces an empty stdout and is rejected here.
 *
 * @returns {string|null} The command name to invoke, or null when none works.
 */
function findPythonCommand() {
    for (const candidate of ['python', 'python3']) {
        try {
            const versionOutput = execSync(`${candidate} --version`, {
                stdio: 'pipe',
                encoding: 'utf8'
            });
            if (/Python 3\./.test(versionOutput)) {
                return candidate;
            }
        } catch (e) {
            // Candidate is missing or not runnable; fall through to the next one.
        }
    }
    return null;
}

const pythonCommand = findPythonCommand();

// Python packages installed for the optional media and dataset features.
const pythonPackages = [
    'opencv-python',
    'pillow',
    'numpy',
    'librosa',
    'soundfile',
    'aiohttp',
    'aiofiles',
    'datasets',
    'webdataset',
    'kaggle'
];

if (pythonCommand) {
    console.log('✅ Python found');

    // 2. Install Python dependencies
    console.log('\n📦 Installing Python dependencies...');
    try {
        execSync(`${pythonCommand} -m pip install ${pythonPackages.join(' ')}`, {
            stdio: 'inherit',
            timeout: 120000 // 2 minutes timeout
        });
        console.log('✅ Python dependencies installed');
    } catch (e) {
        // Best effort: a failed pip run must not fail the npm installation.
        console.warn('⚠️ Failed to install some Python dependencies.');
        console.warn(' You may need to install them manually:');
        console.warn(` ${pythonCommand} -m pip install ${pythonPackages.join(' ')}\n`);
    }
} else {
    // Do not exit here: the remaining setup steps do not invoke Python, and
    // the installation itself must not fail.
    console.warn('⚠️ Python not found. Please install Python 3.8+ for full functionality.');
    console.warn(' Image/audio/video analysis features will not work without Python.\n');
}
|
|
45
|
+
|
|
46
|
+
// 3. Create data directories
// Best effort: an unknown home directory or a filesystem error is reported as
// a warning instead of throwing, so the npm installation does not fail.
const homeDir = process.env.HOME || process.env.USERPROFILE;
const vesperDataDir = homeDir ? path.join(homeDir, '.vesper') : null;
const dirs = vesperDataDir
    ? [
        vesperDataDir,
        path.join(vesperDataDir, 'data'),
        path.join(vesperDataDir, 'data', 'raw'),
        path.join(vesperDataDir, 'data', 'processed'),
        path.join(vesperDataDir, 'datasets')
    ]
    : [];

if (vesperDataDir) {
    try {
        for (const dir of dirs) {
            // `recursive: true` is a no-op for directories that already exist.
            fs.mkdirSync(dir, { recursive: true });
        }
        console.log(`✅ Data directories created at ${vesperDataDir}`);
    } catch (e) {
        console.warn(`⚠️ Could not create data directories at ${vesperDataDir}: ${e.message}`);
    }
} else {
    console.warn('⚠️ Could not determine the home directory (HOME/USERPROFILE not set); skipping data directory creation.');
}
|
|
64
|
+
|
|
65
|
+
// 4. Auto-configure Claude Desktop (Best Effort)
|
|
66
|
+
// Announce the Claude Desktop auto-configuration step (emoji restored from a
// mis-encoded gear symbol).
console.log('\n⚙️ Attempting to auto-configure Claude Desktop...');
|
|
67
|
+
|
|
68
|
+
/**
 * Resolves the location of the Claude Desktop configuration file.
 *
 * - win32:  <APPDATA>/Claude/claude_desktop_config.json
 * - darwin: <home>/Library/Application Support/Claude/claude_desktop_config.json
 *
 * Returns null instead of throwing when the platform is not handled or when
 * the environment variable the path is built from is not set, so the caller
 * can skip auto-configuration.
 *
 * @returns {string|null} Absolute config path, or null when it cannot be determined.
 */
function getClaudeConfigPath() {
    const platform = process.platform;

    if (platform === 'win32') {
        const appData = process.env.APPDATA;
        // path.join throws a TypeError on undefined, so guard explicitly.
        return appData
            ? path.join(appData, 'Claude', 'claude_desktop_config.json')
            : null;
    }

    if (platform === 'darwin') {
        const home = process.env.HOME || process.env.USERPROFILE;
        return home
            ? path.join(home, 'Library', 'Application Support', 'Claude', 'claude_desktop_config.json')
            : null;
    }

    return null;
}
|
|
79
|
+
|
|
80
|
+
// Register the `vesper` server in an existing Claude Desktop config file.
// Best effort: every failure is reported as a warning and never aborts the
// installation. An existing `vesper` entry is left untouched.
const configPath = getClaudeConfigPath();

if (configPath && fs.existsSync(configPath)) {
    try {
        const configContent = fs.readFileSync(configPath, 'utf8');
        const config = JSON.parse(configContent);

        // A non-object root (null, array, string, ...) cannot hold
        // `mcpServers`; for an array the added key would be silently dropped
        // by JSON.stringify, so refuse instead of reporting a false success.
        if (config === null || typeof config !== 'object' || Array.isArray(config)) {
            throw new Error('config file does not contain a JSON object');
        }

        if (!config.mcpServers) config.mcpServers = {};

        if (!config.mcpServers.vesper) {
            config.mcpServers.vesper = {
                command: "vesper",
                args: [],
                env: {
                    "HF_TOKEN": ""
                }
            };

            fs.writeFileSync(configPath, JSON.stringify(config, null, 2));
            console.log(`✅ Automatically added 'vesper' to ${configPath}`);
        } else {
            console.log(`ℹ️ 'vesper' is already configured in ${configPath}`);
        }
    } catch (e) {
        console.warn(`⚠️ Could not auto-configure Claude Desktop: ${e.message}`);
    }
} else {
    console.log('ℹ️ Claude Desktop config not found (skipping auto-config)');
}
|
|
109
|
+
|
|
110
|
+
// Final summary for the user. The emoji were mis-encoded in this copy; the
// sparkles and light bulb are recoverable from the remaining bytes, while the
// "Next steps" glyph is ambiguous and restored as a best guess.
console.log('\n✨ Vesper MCP Server installed successfully!\n');
console.log('📚 Next steps:');
console.log(' 1. Restart your AI assistant (Cursor/Claude)');
console.log(' 2. Try: search_datasets(query="sentiment analysis")');
console.log('\n💡 For full documentation, visit: https://github.com/vesper/mcp-server\n');
|