vesper-wizard 2.0.5 → 2.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +300 -37
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +81 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +62 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +127 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/adapters/supabase.js +49 -0
  15. package/build/cloud/storage-manager.js +26 -0
  16. package/build/cloud/types.js +1 -0
  17. package/build/compliance/service.js +73 -0
  18. package/build/compliance/store.js +80 -0
  19. package/build/compliance/types.js +1 -0
  20. package/build/config/config-manager.js +221 -0
  21. package/build/config/secure-keys.js +51 -0
  22. package/build/config/user-config.js +48 -0
  23. package/build/data/processing-worker.js +23 -0
  24. package/build/data/streaming.js +38 -0
  25. package/build/data/worker-pool.js +39 -0
  26. package/build/export/exporter.js +69 -0
  27. package/build/export/packager.js +100 -0
  28. package/build/export/types.js +1 -0
  29. package/build/fusion/aligner.js +56 -0
  30. package/build/fusion/deduplicator.js +69 -0
  31. package/build/fusion/engine.js +69 -0
  32. package/build/fusion/harmonizer.js +39 -0
  33. package/build/fusion/orchestrator.js +86 -0
  34. package/build/fusion/types.js +1 -0
  35. package/build/gateway/unified-dataset-gateway.js +409 -0
  36. package/build/index.js +2704 -0
  37. package/build/ingestion/hf-downloader.js +171 -0
  38. package/build/ingestion/ingestor.js +271 -0
  39. package/build/ingestion/kaggle-downloader.js +102 -0
  40. package/build/install/install-service.js +41 -0
  41. package/build/jobs/manager.js +136 -0
  42. package/build/jobs/queue.js +59 -0
  43. package/build/jobs/types.js +1 -0
  44. package/build/lib/supabase.js +3 -0
  45. package/build/metadata/dataworld-source.js +89 -0
  46. package/build/metadata/domain.js +147 -0
  47. package/build/metadata/github-scraper.js +47 -0
  48. package/build/metadata/institutional-scrapers.js +49 -0
  49. package/build/metadata/kaggle-scraper.js +182 -0
  50. package/build/metadata/kaggle-source.js +70 -0
  51. package/build/metadata/license.js +68 -0
  52. package/build/metadata/monitoring-service.js +107 -0
  53. package/build/metadata/monitoring-store.js +78 -0
  54. package/build/metadata/monitoring-types.js +1 -0
  55. package/build/metadata/openml-source.js +87 -0
  56. package/build/metadata/quality.js +48 -0
  57. package/build/metadata/rate-limiter.js +128 -0
  58. package/build/metadata/scraper.js +377 -0
  59. package/build/metadata/store.js +340 -0
  60. package/build/metadata/types.js +1 -0
  61. package/build/metadata/uci-scraper.js +49 -0
  62. package/build/monitoring/observability.js +76 -0
  63. package/build/preparation/target-detector.js +75 -0
  64. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  65. package/build/python/asset_downloader_engine.py +92 -0
  66. package/build/python/cleaner.py +226 -0
  67. package/build/python/config.py +263 -0
  68. package/build/python/dataworld_engine.py +208 -0
  69. package/build/python/export_engine.py +243 -0
  70. package/build/python/framework_adapters.py +100 -0
  71. package/build/python/fusion_engine.py +368 -0
  72. package/build/python/github_adapter.py +106 -0
  73. package/build/python/hf_fallback.py +298 -0
  74. package/build/python/image_engine.py +86 -0
  75. package/build/python/kaggle_engine.py +295 -0
  76. package/build/python/media_engine.py +133 -0
  77. package/build/python/nasa_adapter.py +82 -0
  78. package/build/python/openml_engine.py +146 -0
  79. package/build/python/quality_engine.py +267 -0
  80. package/build/python/row_count.py +54 -0
  81. package/build/python/splitter_engine.py +283 -0
  82. package/build/python/target_engine.py +154 -0
  83. package/build/python/test_framework_adapters.py +61 -0
  84. package/build/python/test_fusion_engine.py +89 -0
  85. package/build/python/uci_adapter.py +94 -0
  86. package/build/python/vesper/__init__.py +1 -0
  87. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  88. package/build/python/vesper/core/__init__.py +1 -0
  89. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  90. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  91. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  92. package/build/python/vesper/core/asset_downloader.py +675 -0
  93. package/build/python/vesper/core/download_recipe.py +104 -0
  94. package/build/python/worldbank_adapter.py +99 -0
  95. package/build/quality/analyzer.js +93 -0
  96. package/build/quality/image-analyzer.js +114 -0
  97. package/build/quality/media-analyzer.js +115 -0
  98. package/build/quality/quality-orchestrator.js +162 -0
  99. package/build/quality/types.js +1 -0
  100. package/build/scripts/build-index.js +54 -0
  101. package/build/scripts/check-db.js +73 -0
  102. package/build/scripts/check-jobs.js +24 -0
  103. package/build/scripts/check-naruto.js +17 -0
  104. package/build/scripts/cleanup-kaggle.js +41 -0
  105. package/build/scripts/demo-full-pipeline.js +62 -0
  106. package/build/scripts/demo-ui.js +58 -0
  107. package/build/scripts/e2e-demo.js +72 -0
  108. package/build/scripts/massive-scrape.js +103 -0
  109. package/build/scripts/ops-dashboard.js +33 -0
  110. package/build/scripts/repro-bug.js +37 -0
  111. package/build/scripts/repro-export-bug.js +56 -0
  112. package/build/scripts/scrape-metadata.js +100 -0
  113. package/build/scripts/search-cli.js +26 -0
  114. package/build/scripts/test-bias.js +45 -0
  115. package/build/scripts/test-caching.js +51 -0
  116. package/build/scripts/test-cleaning.js +76 -0
  117. package/build/scripts/test-cloud-storage.js +48 -0
  118. package/build/scripts/test-compliance.js +58 -0
  119. package/build/scripts/test-conversion.js +64 -0
  120. package/build/scripts/test-custom-rules.js +58 -0
  121. package/build/scripts/test-db-opt.js +63 -0
  122. package/build/scripts/test-export-custom.js +33 -0
  123. package/build/scripts/test-exporter.js +53 -0
  124. package/build/scripts/test-fusion.js +61 -0
  125. package/build/scripts/test-github.js +27 -0
  126. package/build/scripts/test-group-split.js +52 -0
  127. package/build/scripts/test-hf-download.js +29 -0
  128. package/build/scripts/test-holdout-manager.js +61 -0
  129. package/build/scripts/test-hybrid-search.js +41 -0
  130. package/build/scripts/test-image-analysis.js +50 -0
  131. package/build/scripts/test-ingestion-infra.js +39 -0
  132. package/build/scripts/test-install.js +40 -0
  133. package/build/scripts/test-institutional.js +26 -0
  134. package/build/scripts/test-integrity.js +41 -0
  135. package/build/scripts/test-jit.js +42 -0
  136. package/build/scripts/test-job-queue.js +62 -0
  137. package/build/scripts/test-kaggle-download.js +34 -0
  138. package/build/scripts/test-large-data.js +50 -0
  139. package/build/scripts/test-mcp-v5.js +74 -0
  140. package/build/scripts/test-media-analysis.js +61 -0
  141. package/build/scripts/test-monitoring.js +91 -0
  142. package/build/scripts/test-observability.js +106 -0
  143. package/build/scripts/test-packager.js +55 -0
  144. package/build/scripts/test-pipeline.js +50 -0
  145. package/build/scripts/test-planning.js +64 -0
  146. package/build/scripts/test-privacy.js +38 -0
  147. package/build/scripts/test-production-sync.js +36 -0
  148. package/build/scripts/test-quality.js +43 -0
  149. package/build/scripts/test-robust-ingestion.js +41 -0
  150. package/build/scripts/test-schema.js +45 -0
  151. package/build/scripts/test-split-validation.js +40 -0
  152. package/build/scripts/test-splitter.js +93 -0
  153. package/build/scripts/test-target-detector.js +29 -0
  154. package/build/scripts/test-uci.js +27 -0
  155. package/build/scripts/test-unified-quality.js +86 -0
  156. package/build/scripts/test-write.js +14 -0
  157. package/build/scripts/verify-integration.js +57 -0
  158. package/build/scripts/verify-priority.js +33 -0
  159. package/build/search/embedder.js +34 -0
  160. package/build/search/engine.js +152 -0
  161. package/build/search/jit-orchestrator.js +258 -0
  162. package/build/search/vector-store.js +123 -0
  163. package/build/splitting/splitter.js +82 -0
  164. package/build/splitting/types.js +1 -0
  165. package/build/tools/formatter.js +251 -0
  166. package/build/utils/downloader.js +52 -0
  167. package/build/utils/selector.js +69 -0
  168. package/mcp-config-template.json +18 -0
  169. package/package.json +101 -29
  170. package/scripts/postinstall.cjs +114 -0
  171. package/scripts/preindex_registry.cjs +157 -0
  172. package/scripts/refresh-index.cjs +87 -0
  173. package/{wizard.js → scripts/wizard.js} +99 -21
  174. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  175. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  176. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  177. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  178. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  179. package/src/python/asset_downloader_engine.py +92 -0
  180. package/src/python/cleaner.py +226 -0
  181. package/src/python/config.py +263 -0
  182. package/src/python/dataworld_engine.py +208 -0
  183. package/src/python/export_engine.py +243 -0
  184. package/src/python/framework_adapters.py +100 -0
  185. package/src/python/fusion_engine.py +368 -0
  186. package/src/python/github_adapter.py +106 -0
  187. package/src/python/hf_fallback.py +298 -0
  188. package/src/python/image_engine.py +86 -0
  189. package/src/python/kaggle_engine.py +295 -0
  190. package/src/python/media_engine.py +133 -0
  191. package/src/python/nasa_adapter.py +82 -0
  192. package/src/python/openml_engine.py +146 -0
  193. package/src/python/quality_engine.py +267 -0
  194. package/src/python/row_count.py +54 -0
  195. package/src/python/splitter_engine.py +283 -0
  196. package/src/python/target_engine.py +154 -0
  197. package/src/python/test_framework_adapters.py +61 -0
  198. package/src/python/test_fusion_engine.py +89 -0
  199. package/src/python/uci_adapter.py +94 -0
  200. package/src/python/vesper/__init__.py +1 -0
  201. package/src/python/vesper/core/__init__.py +1 -0
  202. package/src/python/vesper/core/asset_downloader.py +675 -0
  203. package/src/python/vesper/core/download_recipe.py +104 -0
  204. package/src/python/worldbank_adapter.py +99 -0
  205. package/vesper-mcp-config.json +0 -6
@@ -0,0 +1,251 @@
1
/**
 * Format job status for visual representation.
 *
 * Builds a plain-text card with the job type, id, mapped status label, a
 * 20-cell progress bar, the current activity text, a polling hint and, when
 * present, the result URL and an indented error message.
 *
 * @param {object} job - Job record. Reads `type`, `id`, `status`, `progress`,
 *   `status_text`, `updated_at` and the optional `result_url` / `error`.
 * @returns {string} Multi-line, newline-terminated status card.
 */
export function formatJobStatus(job) {
    const statusMap = {
        "pending": "PENDING",
        "queued": "QUEUED",
        "running": "RUNNING",
        "completed": "COMPLETED",
        "failed": "FAILED",
        "retrying": "RETRYING"
    };
    // Own-property lookup: a status such as "constructor" must not resolve to
    // something inherited from Object.prototype.
    const statusText = Object.hasOwn(statusMap, job.status) ? statusMap[job.status] : "UNKNOWN";
    const barWidth = 20;
    // Clamp to 0..100 before sizing the bar; String.prototype.repeat throws a
    // RangeError on a negative count, which an out-of-range progress would cause.
    const numericProgress = Number(job.progress);
    const boundedProgress = Number.isFinite(numericProgress)
        ? Math.min(100, Math.max(0, numericProgress))
        : 0;
    const filledWidth = Math.round((boundedProgress / 100) * barWidth);
    const emptyWidth = barWidth - filledWidth;
    const bar = "█".repeat(filledWidth) + "░".repeat(emptyWidth);
    let output = `═ Job Status: ${job.type.toUpperCase()} ═\n`;
    output += `ID: ${job.id}\n`;
    output += `Status: ${statusText}\n`;
    // The percentage shows the raw value so an out-of-range report stays visible.
    output += `Progress: ${bar} ${job.progress}%\n`;
    output += `Activity: ${job.status_text}\n`;
    const stillActive = ["running", "retrying", "queued", "pending"].includes(job.status);
    if (stillActive) {
        output += `Polling hint: check again in 5-10 seconds.\n`;
    }
    else {
        output += `Polling hint: no further polling required.\n`;
    }
    if (job.result_url) {
        output += `\nResult: ${job.result_url}\n`;
    }
    if (job.error) {
        output += `\nERROR:\n`;
        // Format multi-line errors nicely; String() lets Error objects through too.
        for (const line of String(job.error).split('\n')) {
            output += ` ${line}\n`;
        }
        output += `\n`;
    }
    output += `Updated: ${new Date(job.updated_at).toLocaleTimeString()}\n`;
    output += "═".repeat(25) + "\n";
    return output;
}
45
/**
 * Format dataset search results for human-readable display.
 *
 * Produces a numbered plain-text listing: a header with the hit count, then
 * one section per dataset covering source/access/safety, relevance, an
 * optional description preview (first 200 characters), quality warnings,
 * stats, splits, license details and the download URL.
 *
 * @param {Array<object>} results - Dataset records to render.
 * @returns {string} The listing, or a fixed "no datasets" message when
 *   `results` is empty.
 */
export function formatSearchResults(results) {
    if (results.length === 0) {
        return "No datasets found matching your query.";
    }
    // Sources that can be read without user-supplied credentials.
    const openSourceNames = new Set(["huggingface", "openml", "s3", "uci", "github", "worldbank", "nasa"]);
    const safetyLabels = new Map([
        ["safe", "Safe"],
        ["restricted", "Restricted"],
    ]);
    const chunks = [
        `Found ${results.length} dataset(s):\n\n`,
        "═".repeat(80) + "\n\n",
    ];
    results.forEach((entry, position) => {
        const score = entry.relevance_score || 0;
        // Source badge and access level
        const access = openSourceNames.has(entry.source) ? "Open Access" : "Requires API Key";
        const provider = entry.source.charAt(0).toUpperCase() + entry.source.slice(1);
        // Safety indicator
        const safety = safetyLabels.get(entry.license.category) ?? "Unknown License";
        // Header
        chunks.push(
            `${position + 1}. ${entry.name}\n`,
            ` Source: ${provider} | ${access} | ${safety}\n`,
            ` Relevance: ${(score * 100).toFixed(0)}% | ID: ${entry.id}\n\n`,
        );
        // Description, truncated to a 200-character preview
        if (entry.description && entry.description.length > 0) {
            let preview = entry.description;
            if (preview.length > 200) {
                preview = preview.substring(0, 200) + "...";
            }
            chunks.push(` ${preview}\n\n`);
        }
        // Quality warnings
        if (entry.quality_warnings && entry.quality_warnings.length > 0) {
            chunks.push(` Quality Warnings:\n`);
            chunks.push(...entry.quality_warnings.map((note) => ` • ${note}\n`));
            chunks.push("\n");
        }
        // Key stats; falsy counters (missing or zero) are left out
        chunks.push(` Stats:\n`);
        if (entry.downloads) {
            chunks.push(` Downloads: ${entry.downloads.toLocaleString()}\n`);
        }
        if (entry.likes) {
            chunks.push(` Likes: ${entry.likes}\n`);
        }
        if (entry.total_examples) {
            chunks.push(` Examples: ${entry.total_examples.toLocaleString()}\n`);
        }
        if (entry.total_size_mb) {
            chunks.push(` Size: ${entry.total_size_mb} MB\n`);
        }
        chunks.push(` Domain: ${entry.domain || "unknown"}\n`);
        chunks.push(` Task: ${entry.task || "unknown"}\n`);
        // Data splits
        if (entry.splits && entry.splits.length > 0) {
            const names = entry.splits.map((part) => part.name);
            chunks.push(` Splits: ${names.join(", ")}\n`);
        }
        // License details
        const license = entry.license;
        chunks.push(`\n License: ${license.id || "Unknown"}\n`);
        if (license.warnings && license.warnings.length > 0) {
            chunks.push(...license.warnings.map((note) => ` WARNING: ${note}\n`));
        }
        if (license.commercial_use !== undefined) {
            const verdict = license.commercial_use ? "Yes" : "No";
            chunks.push(` Commercial use: ${verdict}\n`);
        }
        // Download link and section separator
        chunks.push(`\n ${entry.download_url}\n`);
        chunks.push("\n" + "─".repeat(80) + "\n\n");
    });
    return chunks.join("");
}
124
/**
 * Format detailed dataset info.
 *
 * Renders a full plain-text report for one dataset: title banner, source and
 * safety summary, connector notes for credentialed sources, description,
 * quality warnings, metadata, data characteristics, splits, up to ten
 * columns, license information, safety flags, up to fifteen tags and the
 * download URL.
 *
 * Fields whose value is not known are reported as such rather than as a
 * misleading default: a missing `quality_score` and a missing or unparsable
 * `last_updated` print "N/A", and a missing `license.commercial_use` prints
 * "Unknown" instead of "Not allowed".
 *
 * @param {object} ds - Dataset record; must carry `source` (string) and a
 *   `license` object, everything else is optional.
 * @returns {string} Multi-line report.
 */
export function formatDatasetInfo(ds) {
    let output = "";
    // Header
    output += "═".repeat(80) + "\n";
    output += `${ds.name}\n`;
    output += "═".repeat(80) + "\n\n";
    // Source and safety
    const openSources = ["huggingface", "openml", "s3", "uci", "github", "worldbank", "nasa"];
    const isOpen = openSources.includes(ds.source);
    const sourceLabel = ds.source.charAt(0).toUpperCase() + ds.source.slice(1);
    const accessBadge = isOpen ? "Open Access" : "Requires API Key";
    let safetyIndicator = "";
    if (ds.license.category === "safe") {
        safetyIndicator = "Safe for use";
    }
    else if (ds.license.category === "restricted") {
        safetyIndicator = "Restricted - Review license carefully";
    }
    else {
        safetyIndicator = "Unknown license - Use with caution";
    }
    output += `Source: ${sourceLabel} (${accessBadge})\n`;
    output += `Safety: ${safetyIndicator}\n`;
    output += `ID: ${ds.id}\n\n`;
    if (!isOpen && ds.source === "kaggle") {
        output += `NOTE: This dataset uses the Kaggle connector. Vesper can access it through server-managed credentials when configured, otherwise a Kaggle key is still required.\n\n`;
    }
    if (!isOpen && ds.source === "dataworld") {
        output += `NOTE: This dataset uses the data.world connector. Vesper can access it through a server-managed token when configured.\n\n`;
    }
    // Description
    if (ds.description) {
        output += "Description:\n";
        output += `${ds.description}\n\n`;
    }
    // Quality warnings
    if (ds.quality_warnings && ds.quality_warnings.length > 0) {
        output += "Quality Warnings:\n";
        ds.quality_warnings.forEach(warning => {
            output += ` • ${warning}\n`;
        });
        output += "\n";
    }
    // Metadata
    // A missing score would otherwise render as "undefined/100".
    const qualityScore = ds.quality_score == null ? "N/A" : `${ds.quality_score}/100`;
    // A missing or unparsable timestamp would otherwise render as "Invalid Date".
    const updatedAt = ds.last_updated == null ? null : new Date(ds.last_updated);
    const lastUpdated = updatedAt !== null && !Number.isNaN(updatedAt.getTime())
        ? updatedAt.toLocaleDateString()
        : "N/A";
    output += "Metadata:\n";
    output += ` Downloads: ${ds.downloads?.toLocaleString() || "N/A"}\n`;
    output += ` Likes: ${ds.likes || 0}\n`;
    output += ` Quality Score: ${qualityScore}\n`;
    output += ` Domain: ${ds.domain || "unknown"}\n`;
    output += ` Task: ${ds.task || "unknown"}\n`;
    output += ` Languages: ${ds.languages?.join(", ") || "N/A"}\n`;
    output += ` Last Updated: ${lastUpdated}\n\n`;
    // Data characteristics
    output += "Data Characteristics:\n";
    output += ` Total Examples: ${ds.total_examples?.toLocaleString() || "N/A"}\n`;
    output += ` Total Size: ${ds.total_size_mb ? ds.total_size_mb + " MB" : "N/A"}\n`;
    output += ` Structured: ${ds.is_structured ? "Yes" : "No"}\n`;
    output += ` Has Target Column: ${ds.has_target_column ? "Yes" : "No"}\n`;
    output += ` Format: ${ds.format || "N/A"}\n\n`;
    // Splits
    if (ds.splits && ds.splits.length > 0) {
        output += "Data Splits:\n";
        ds.splits.forEach(split => {
            output += ` • ${split.name}: ${split.num_examples?.toLocaleString() || "?"} examples`;
            if (split.size_bytes) {
                output += ` (${(split.size_bytes / (1024 * 1024)).toFixed(2)} MB)`;
            }
            output += "\n";
        });
        output += "\n";
    }
    // Columns (first ten only, with a count of the remainder)
    if (ds.columns && ds.columns.length > 0) {
        output += "Columns:\n";
        ds.columns.slice(0, 10).forEach(col => {
            const targetMarker = col.is_target ? " [TARGET]" : "";
            output += ` • ${col.name}${targetMarker}`;
            if (col.type)
                output += ` (${col.type})`;
            output += "\n";
        });
        if (ds.columns.length > 10) {
            output += ` ... and ${ds.columns.length - 10} more columns\n`;
        }
        output += "\n";
    }
    // License
    // An absent commercial_use flag means "not known", not "forbidden".
    let commercialUse = "Unknown";
    if (ds.license.commercial_use != null) {
        commercialUse = ds.license.commercial_use ? "Allowed" : "Not allowed";
    }
    output += "License Information:\n";
    output += ` License: ${ds.license.id || "Unknown"}\n`;
    output += ` Category: ${ds.license.category}\n`;
    output += ` Commercial Use: ${commercialUse}\n`;
    if (ds.license.warnings && ds.license.warnings.length > 0) {
        output += ` Warnings:\n`;
        ds.license.warnings.forEach(warning => {
            output += ` WARNING: ${warning}\n`;
        });
    }
    if (ds.license.usage_restrictions && ds.license.usage_restrictions.length > 0) {
        output += ` Restrictions:\n`;
        ds.license.usage_restrictions.forEach(restriction => {
            output += ` • ${restriction}\n`;
        });
    }
    output += "\n";
    // Safety flags
    output += "Safety Flags:\n";
    output += ` Safe Source: ${ds.is_safe_source ? "Yes" : "No"}\n`;
    output += ` Has Personal Data: ${ds.has_personal_data ? "Yes" : "No"}\n`;
    output += ` Paywalled: ${ds.is_paywalled ? "Yes" : "No"}\n`;
    output += ` Scraped Web Data: ${ds.is_scraped_web_data ? "Yes" : "No"}\n\n`;
    // Tags (first fifteen only, with a count of the remainder)
    if (ds.tags && ds.tags.length > 0) {
        output += "Tags:\n";
        output += ` ${ds.tags.slice(0, 15).join(", ")}`;
        if (ds.tags.length > 15) {
            output += ` ... and ${ds.tags.length - 15} more`;
        }
        output += "\n\n";
    }
    // Download link
    output += "Download:\n";
    output += ` ${ds.download_url}\n\n`;
    output += "═".repeat(80) + "\n";
    return output;
}
@@ -0,0 +1,52 @@
1
+ import fs from "fs";
2
+ import { Readable } from "stream";
3
+ import { finished } from "stream/promises";
4
+ import { retryWithBackoff } from "../metadata/rate-limiter.js";
5
export class RobustDownloader {
    /**
     * Downloads a file with automatic retries and resume support.
     *
     * Each attempt runs inside `retryWithBackoff` (up to 5 retries, 2000 ms
     * initial delay). With `options.resume` set and a non-empty file already
     * at `targetPath`, the request carries a `Range` header and the response
     * is appended to that file — but only when the server actually honours
     * the range (HTTP 206). A server that ignores `Range` answers 200 with
     * the whole body; in that case the file is rewritten from the start, as
     * appending would corrupt it.
     *
     * @param {string} url - Resource to fetch.
     * @param {string} targetPath - File to write the body to.
     * @param {object} [options]
     * @param {object} [options.headers] - Extra request headers.
     * @param {boolean} [options.resume] - Continue a partial file if present.
     * @param {(downloaded: number, total: number) => void} [options.onProgress]
     *   Called at most about every 500 ms with bytes written so far and the
     *   expected total (`content-length` plus the resumed offset; when the
     *   header is absent the total is just the resumed offset).
     * @returns {Promise<void>} Resolves once the file stream has finished.
     * @throws {Error} On a non-2xx response (other than 416) or an empty body,
     *   after whatever retries `retryWithBackoff` performs.
     */
    async download(url, targetPath, options = {}) {
        await retryWithBackoff(async () => {
            let startByte = 0;
            const headers = { ...(options.headers || {}) };
            // Handle resume logic
            if (options.resume && fs.existsSync(targetPath)) {
                startByte = fs.statSync(targetPath).size;
                if (startByte > 0) {
                    headers["Range"] = `bytes=${startByte}-`;
                    console.error(`[Downloader] Resuming from byte ${startByte}`);
                }
            }
            const response = await fetch(url, { headers });
            if (response.status === 416) {
                // Requested range not satisfiable - likely already finished
                console.error("[Downloader] Range not satisfiable, file might be complete.");
                return;
            }
            // response.ok already covers 206, so no separate check is needed.
            if (!response.ok) {
                throw new Error(`Download failed: ${response.statusText} (${response.status})`);
            }
            if (startByte > 0 && response.status !== 206) {
                // The Range header was ignored and the full body is coming:
                // start over instead of appending it to the partial file.
                console.error("[Downloader] Server ignored Range request, restarting from byte 0");
                startByte = 0;
            }
            const contentLength = response.headers.get("content-length");
            const totalSize = (contentLength ? parseInt(contentLength, 10) : 0) + startByte;
            const reader = response.body;
            if (!reader)
                throw new Error("Response body is empty");
            // Open stream in append mode if resuming
            const fileStream = fs.createWriteStream(targetPath, { flags: startByte > 0 ? "a" : "w" });
            const nodeReadable = Readable.fromWeb(reader);
            let downloadedBytes = startByte;
            let lastProgressTime = 0;
            nodeReadable.on("data", (chunk) => {
                downloadedBytes += chunk.length;
                // Throttle progress updates
                const now = Date.now();
                if (options.onProgress && (now - lastProgressTime > 500 || downloadedBytes === totalSize)) {
                    options.onProgress(downloadedBytes, totalSize);
                    lastProgressTime = now;
                }
            });
            // pipe() does not forward source errors to the destination, so a
            // network failure would leave finished() waiting forever. Destroying
            // the file stream with the error makes finished() reject instead.
            nodeReadable.on("error", (err) => {
                fileStream.destroy(err);
            });
            await finished(nodeReadable.pipe(fileStream));
        }, { maxRetries: 5, initialDelay: 2000 });
    }
}
@@ -0,0 +1,69 @@
1
+ import readline from "readline";
2
/**
 * Interactive multi-select checklist for the terminal.
 *
 * Each option is an object with `name` (label), `value` (what is returned)
 * and `selected` (current checkbox state, toggled in place).
 */
export class Selector {
    currentIndex = 0;
    options;
    title;
    /**
     * @param {string} title - Heading printed above the options.
     * @param {Array<{name: string, value: *, selected: boolean}>} options
     */
    constructor(title, options) {
        this.title = title;
        this.options = options;
    }
    /**
     * Redraw the title, every option row and the key hint, overwriting the
     * previously drawn block via ANSI cursor movement.
     */
    render() {
        // Clear previous lines
        process.stdout.write("\x1b[?25l"); // Hide cursor
        readline.cursorTo(process.stdout, 0);
        // Clear the lines we used before (options + title + blank line)
        for (let i = 0; i <= this.options.length + 1; i++) {
            readline.clearLine(process.stdout, 0);
            process.stdout.write("\x1b[1A"); // Move up one line
        }
        readline.clearLine(process.stdout, 0);
        console.log(`\n${this.title}`);
        this.options.forEach((opt, idx) => {
            const isCurrent = idx === this.currentIndex;
            const checkbox = opt.selected ? "[\x1b[32mX\x1b[0m]" : "[ ]";
            // Same visible width as the "> " marker so checkboxes stay aligned.
            const cursor = isCurrent ? "\x1b[36m>\x1b[0m " : "  ";
            const label = isCurrent ? `\x1b[36m${opt.name}\x1b[0m` : opt.name;
            console.log(`${cursor}${checkbox} ${label}`);
        });
        console.log("\x1b[2m(Use arrows to move, Space to toggle, Enter to confirm)\x1b[0m");
    }
    /**
     * Show the checklist and wait for the user to confirm with Enter.
     *
     * Up/Down move the highlight (wrapping around), Space toggles the
     * highlighted option, Enter resolves, Ctrl+C exits the process.
     *
     * @returns {Promise<Array<*>>} `value` of every option selected at
     *   confirmation time; an empty array right away when there are no options.
     */
    async run() {
        if (this.options.length === 0)
            return [];
        readline.emitKeypressEvents(process.stdin);
        // setRawMode only exists on TTY streams; with piped or redirected
        // stdin an unguarded call is a TypeError.
        const setRawMode = (enabled) => {
            if (process.stdin.isTTY) {
                process.stdin.setRawMode(enabled);
            }
        };
        setRawMode(true);
        // Initial render room (print blank lines to be cleared)
        console.log("\n".repeat(this.options.length + 1));
        this.render();
        return new Promise((resolve) => {
            const handleKey = (str, key) => {
                if (!key)
                    return;
                if (key.name === "up") {
                    this.currentIndex = (this.currentIndex - 1 + this.options.length) % this.options.length;
                    this.render();
                }
                else if (key.name === "down") {
                    this.currentIndex = (this.currentIndex + 1) % this.options.length;
                    this.render();
                }
                else if (key.name === "space") {
                    this.options[this.currentIndex].selected = !this.options[this.currentIndex].selected;
                    this.render();
                }
                else if (key.name === "return") {
                    setRawMode(false);
                    process.stdin.removeListener("keypress", handleKey);
                    process.stdout.write("\x1b[?25h"); // Show cursor
                    console.log("");
                    resolve(this.options.filter(o => o.selected).map(o => o.value));
                }
                else if (key.ctrl && key.name === "c") {
                    setRawMode(false);
                    // render() hid the cursor; give it back before leaving.
                    process.stdout.write("\x1b[?25h");
                    process.exit();
                }
            };
            process.stdin.on("keypress", handleKey);
        });
    }
}
@@ -0,0 +1,18 @@
1
+ {
2
+ "mcpServers": {
3
+ "vesper": {
4
+ "command": "npx",
5
+ "args": [
6
+ "-y",
7
+ "-p",
8
+ "@vespermcp/mcp-server@latest",
9
+ "vespermcp"
10
+ ],
11
+ "env": {
12
+ "KAGGLE_USERNAME": "your-kaggle-username",
13
+ "KAGGLE_KEY": "your-kaggle-api-key",
14
+ "HF_TOKEN": "your-huggingface-token"
15
+ }
16
+ }
17
+ }
18
+ }
package/package.json CHANGED
@@ -1,29 +1,101 @@
1
- {
2
- "name": "vesper-wizard",
3
- "version": "2.0.5",
4
- "description": "Zero-friction setup wizard for Vesper — local MCP server, unified dataset API, and agent auto-config in 60 seconds",
5
- "bin": {
6
- "vesper-wizard": "wizard.js"
7
- },
8
- "keywords": [
9
- "vesper",
10
- "mcp",
11
- "wizard",
12
- "setup",
13
- "datasets",
14
- "machine-learning",
15
- "huggingface",
16
- "kaggle",
17
- "openml"
18
- ],
19
- "author": "Vesper Team",
20
- "license": "MIT",
21
- "repository": {
22
- "type": "git",
23
- "url": "https://github.com/vesper/mcp-server"
24
- },
25
- "engines": {
26
- "node": ">=18.0.0"
27
- },
28
- "dependencies": {}
29
- }
1
+ {
2
+ "name": "vesper-wizard",
3
+ "version": "2.0.6",
4
+ "description": "AI-powered dataset discovery, quality analysis, and preparation MCP server with multimodal support (text, image, audio, video)",
5
+ "type": "module",
6
+ "main": "build/index.js",
7
+ "bin": {
8
+ "mcp-server": "./build/index.js",
9
+ "vespermcp": "./build/index.js",
10
+ "vesper-wizard": "scripts/wizard.js"
11
+ },
12
+ "files": [
13
+ "build/**/*",
14
+ "src/python/**/*",
15
+ "scripts/**/*",
16
+ "README.md",
17
+ "LICENSE",
18
+ "mcp-config-template.json"
19
+ ],
20
+ "scripts": {
21
+ "build": "tsc && node -e \"const fs=require('fs');const path=require('path');const src='src/python';const dest='build/python';const walk=(d)=>fs.readdirSync(d,{withFileTypes:true}).flatMap(e=>e.isDirectory()?walk(path.join(d,e.name)):[path.join(d,e.name)]);if(!fs.existsSync(dest))fs.mkdirSync(dest,{recursive:true});if(fs.existsSync(src)){for(const f of walk(src)){if(!f.endsWith('.py'))continue;const rel=path.relative(src,f);const out=path.join(dest,rel);fs.mkdirSync(path.dirname(out),{recursive:true});fs.copyFileSync(f,out);}}console.log('Copied Python scripts to build/python');\"",
22
+ "dev": "tsx watch src/index.ts",
23
+ "postinstall": "node scripts/postinstall.cjs",
24
+ "scrape": "tsx src/scripts/scrape-metadata.ts",
25
+ "massive-scrape": "tsx src/scripts/massive-scrape.ts",
26
+ "index": "tsx src/scripts/build-index.ts",
27
+ "search-cli": "tsx src/scripts/search-cli.ts",
28
+ "check-db": "tsx src/scripts/check-db.ts",
29
+ "test-jit": "tsx src/scripts/test-jit.ts",
30
+ "demo-ui": "tsx src/scripts/demo-ui.ts",
31
+ "fuse": "node build/index.js fuse",
32
+ "discover": "node build/index.js discover",
33
+ "download": "node build/index.js download",
34
+ "export": "node build/index.js export",
35
+ "config": "node build/index.js config",
36
+ "test-fusion-engine": "py src/python/test_fusion_engine.py",
37
+ "setup": "node build/index.js --setup",
38
+ "setup:silent": "node build/index.js --setup --silent",
39
+ "refresh-index": "node scripts/refresh-index.cjs",
40
+ "test": "vitest",
41
+ "start": "node build/index.js"
42
+ },
43
+ "keywords": [
44
+ "mcp",
45
+ "model-context-protocol",
46
+ "dataset",
47
+ "machine-learning",
48
+ "data-quality",
49
+ "huggingface",
50
+ "kaggle",
51
+ "multimodal",
52
+ "image-analysis",
53
+ "audio-analysis",
54
+ "video-analysis",
55
+ "data-preparation",
56
+ "ai",
57
+ "ml"
58
+ ],
59
+ "author": "Vesper Team",
60
+ "license": "MIT",
61
+ "repository": {
62
+ "type": "git",
63
+ "url": "https://github.com/vesper/mcp-server"
64
+ },
65
+ "engines": {
66
+ "node": ">=18.0.0",
67
+ "npm": ">=8.0.0"
68
+ },
69
+ "dependencies": {
70
+ "@huggingface/hub": "^2.7.1",
71
+ "@modelcontextprotocol/sdk": "^1.25.2",
72
+ "@polar-sh/nextjs": "^0.9.4",
73
+ "@supabase/supabase-js": "^2.98.0",
74
+ "@xenova/transformers": "^2.17.2",
75
+ "adm-zip": "^0.5.16",
76
+ "ajv": "^8.17.1",
77
+ "ajv-formats": "^3.0.1",
78
+ "better-sqlite3": "^12.6.0",
79
+ "inquirer": "^13.3.0",
80
+ "lodash": "^4.17.21",
81
+ "uuid": "^13.0.0",
82
+ "zod": "^4.3.5",
83
+ "zod-to-json-schema": "^3.25.1"
84
+ },
85
+ "devDependencies": {
86
+ "@types/adm-zip": "^0.5.7",
87
+ "@types/better-sqlite3": "^7.6.13",
88
+ "@types/lodash": "^4.17.23",
89
+ "@types/node": "^25.0.9",
90
+ "@types/uuid": "^10.0.0",
91
+ "@typescript-eslint/eslint-plugin": "^8.53.0",
92
+ "@typescript-eslint/parser": "^8.53.0",
93
+ "eslint": "^9.39.2",
94
+ "eslint-config-prettier": "^10.1.8",
95
+ "prettier": "^3.8.0",
96
+ "tsx": "^4.21.0",
97
+ "typescript": "^5.9.3",
98
+ "vitest": "^4.0.17"
99
+ },
100
+ "packageManager": "pnpm@10.18.1+sha512.77a884a165cbba2d8d1c19e3b4880eee6d2fcabd0d879121e282196b80042351d5eb3ca0935fa599da1dc51265cc68816ad2bddd2a2de5ea9fdf92adbec7cd34"
101
+ }
@@ -0,0 +1,114 @@
1
+ #!/usr/bin/env node
2
+
3
+ const { execSync } = require('child_process');
4
+ const fs = require('fs');
5
+ const path = require('path');
6
+
7
console.log('\n🚀 Setting up Vesper MCP Server...\n');

// 1. Check for Python
/**
 * Locate a Python 3 launcher on PATH.
 *
 * Runs `<candidate> --version` for each known launcher name and returns the
 * first one that reports a 3.x interpreter. Many systems ship only `python3`,
 * and a bare `python` may still be Python 2, so probing `python` alone is not
 * enough.
 *
 * @returns {string|null} Command to invoke, or null when none qualifies.
 */
function findPythonCommand() {
  const candidates = process.platform === 'win32'
    ? ['python', 'py', 'python3']
    : ['python3', 'python'];

  for (const candidate of candidates) {
    try {
      // Python 2 prints its version on stderr, so merge the streams before matching.
      const version = execSync(`${candidate} --version 2>&1`, { stdio: 'pipe' }).toString();
      if (/Python 3\./.test(version)) {
        return candidate;
      }
    } catch (e) {
      // Candidate is missing or not runnable; try the next one.
    }
  }
  return null;
}

const pythonCommand = findPythonCommand();

const pythonPackages = [
  'opencv-python',
  'pillow',
  'numpy',
  'librosa',
  'soundfile',
  'aiohttp',
  'aiofiles',
  'datasets',
  'webdataset',
  'kaggle'
];

if (pythonCommand) {
  console.log('✅ Python found');

  // 2. Install Python dependencies
  console.log('\n📦 Installing Python dependencies...');
  try {
    execSync(`${pythonCommand} -m pip install ${pythonPackages.join(' ')}`, {
      stdio: 'inherit',
      timeout: 120000 // 2 minutes timeout
    });
    console.log('✅ Python dependencies installed');
  } catch (e) {
    console.warn('⚠️ Failed to install some Python dependencies.');
    console.warn(' You may need to install them manually:');
    console.warn(` pip install ${pythonPackages.join(' ')}\n`);
  }
} else {
  // Don't fail installation, and don't stop either: the remaining setup
  // steps do not need Python.
  console.warn('⚠️ Python not found. Please install Python 3.8+ for full functionality.');
  console.warn(' Image/audio/video analysis features will not work without Python.\n');
}
45
+
46
// 3. Create data directories
// HOME and USERPROFILE can both be unset (service accounts, some CI runners);
// os.homedir() is the fallback so path.join never receives undefined.
const homeDir = process.env.HOME || process.env.USERPROFILE || require('os').homedir();
const vesperDataDir = path.join(homeDir, '.vesper');
const dirs = [
  vesperDataDir,
  path.join(vesperDataDir, 'data'),
  path.join(vesperDataDir, 'data', 'raw'),
  path.join(vesperDataDir, 'data', 'processed'),
  path.join(vesperDataDir, 'datasets')
];

try {
  // With { recursive: true } mkdirSync is a no-op for directories that exist.
  dirs.forEach(dir => {
    fs.mkdirSync(dir, { recursive: true });
  });
  console.log(`✅ Data directories created at ${vesperDataDir}`);
} catch (e) {
  // Best effort, like the other steps: an unwritable home directory must not
  // make `npm install` fail.
  console.warn(`⚠️ Could not create data directories at ${vesperDataDir}: ${e.message}`);
}
64
+
65
// 4. Auto-configure Claude Desktop (Best Effort)
console.log('\n⚙️ Attempting to auto-configure Claude Desktop...');

/**
 * Resolve the expected location of Claude Desktop's config file.
 *
 * Windows: `%APPDATA%\Claude\claude_desktop_config.json`.
 * macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`.
 *
 * @returns {string|null} Path to the config file, or null on any other
 *   platform or when the environment variable the path is built from is not
 *   set (path.join throws on undefined, and this runs outside a try/catch).
 */
function getClaudeConfigPath() {
  const platform = process.platform;
  const home = process.env.HOME || process.env.USERPROFILE;

  if (platform === 'win32') {
    const appData = process.env.APPDATA;
    return appData ? path.join(appData, 'Claude', 'claude_desktop_config.json') : null;
  }
  if (platform === 'darwin') {
    return home
      ? path.join(home, 'Library', 'Application Support', 'Claude', 'claude_desktop_config.json')
      : null;
  }
  return null;
}
79
+
80
const configPath = getClaudeConfigPath();

// Only an existing Claude Desktop config is touched; nothing is created when
// the application does not appear to be installed.
if (configPath && fs.existsSync(configPath)) {
  try {
    const configContent = fs.readFileSync(configPath, 'utf8');
    const config = JSON.parse(configContent);

    if (!config.mcpServers) config.mcpServers = {};

    // Never overwrite an entry the user (or an earlier install) already has.
    if (!config.mcpServers.vesper) {
      config.mcpServers.vesper = {
        // Must name an executable this package installs: its package.json
        // `bin` map exposes `vespermcp` (plus `mcp-server` and
        // `vesper-wizard`), but no `vesper`.
        command: "vespermcp",
        args: [],
        env: {
          "HF_TOKEN": ""
        }
      };

      fs.writeFileSync(configPath, JSON.stringify(config, null, 2));
      console.log(`✅ Automatically added 'vesper' to ${configPath}`);
    } else {
      console.log(`ℹ️ 'vesper' is already configured in ${configPath}`);
    }
  } catch (e) {
    // Unreadable or malformed config: report it and carry on.
    console.warn(`⚠️ Could not auto-configure Claude Desktop: ${e.message}`);
  }
} else {
  console.log('ℹ️ Claude Desktop config not found (skipping auto-config)');
}

console.log('\n✨ Vesper MCP Server installed successfully!\n');
console.log('📖 Next steps:');
console.log(' 1. Restart your AI assistant (Cursor/Claude)');
console.log(' 2. Try: search_datasets(query="sentiment analysis")');
console.log('\n💡 For full documentation, visit: https://github.com/vesper/mcp-server\n');