vesper-wizard 2.0.4 → 2.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +300 -37
  3. package/build/cache/cdn.js +34 -0
  4. package/build/cache/service.js +63 -0
  5. package/build/cleaning/cleaner.js +81 -0
  6. package/build/cleaning/evaluator.js +89 -0
  7. package/build/cleaning/executor.js +62 -0
  8. package/build/cleaning/exporter.js +87 -0
  9. package/build/cleaning/planner.js +127 -0
  10. package/build/cleaning/rules.js +57 -0
  11. package/build/cleaning/types.js +1 -0
  12. package/build/cloud/adapters/local.js +37 -0
  13. package/build/cloud/adapters/s3.js +24 -0
  14. package/build/cloud/adapters/supabase.js +49 -0
  15. package/build/cloud/storage-manager.js +26 -0
  16. package/build/cloud/types.js +1 -0
  17. package/build/compliance/service.js +73 -0
  18. package/build/compliance/store.js +80 -0
  19. package/build/compliance/types.js +1 -0
  20. package/build/config/config-manager.js +221 -0
  21. package/build/config/secure-keys.js +51 -0
  22. package/build/config/user-config.js +48 -0
  23. package/build/data/processing-worker.js +23 -0
  24. package/build/data/streaming.js +38 -0
  25. package/build/data/worker-pool.js +39 -0
  26. package/build/export/exporter.js +69 -0
  27. package/build/export/packager.js +100 -0
  28. package/build/export/types.js +1 -0
  29. package/build/fusion/aligner.js +56 -0
  30. package/build/fusion/deduplicator.js +69 -0
  31. package/build/fusion/engine.js +69 -0
  32. package/build/fusion/harmonizer.js +39 -0
  33. package/build/fusion/orchestrator.js +86 -0
  34. package/build/fusion/types.js +1 -0
  35. package/build/gateway/unified-dataset-gateway.js +409 -0
  36. package/build/index.js +2704 -0
  37. package/build/ingestion/hf-downloader.js +171 -0
  38. package/build/ingestion/ingestor.js +271 -0
  39. package/build/ingestion/kaggle-downloader.js +102 -0
  40. package/build/install/install-service.js +41 -0
  41. package/build/jobs/manager.js +136 -0
  42. package/build/jobs/queue.js +59 -0
  43. package/build/jobs/types.js +1 -0
  44. package/build/lib/supabase.js +3 -0
  45. package/build/metadata/dataworld-source.js +89 -0
  46. package/build/metadata/domain.js +147 -0
  47. package/build/metadata/github-scraper.js +47 -0
  48. package/build/metadata/institutional-scrapers.js +49 -0
  49. package/build/metadata/kaggle-scraper.js +182 -0
  50. package/build/metadata/kaggle-source.js +70 -0
  51. package/build/metadata/license.js +68 -0
  52. package/build/metadata/monitoring-service.js +107 -0
  53. package/build/metadata/monitoring-store.js +78 -0
  54. package/build/metadata/monitoring-types.js +1 -0
  55. package/build/metadata/openml-source.js +87 -0
  56. package/build/metadata/quality.js +48 -0
  57. package/build/metadata/rate-limiter.js +128 -0
  58. package/build/metadata/scraper.js +377 -0
  59. package/build/metadata/store.js +340 -0
  60. package/build/metadata/types.js +1 -0
  61. package/build/metadata/uci-scraper.js +49 -0
  62. package/build/monitoring/observability.js +76 -0
  63. package/build/preparation/target-detector.js +75 -0
  64. package/build/python/__pycache__/config.cpython-312.pyc +0 -0
  65. package/build/python/asset_downloader_engine.py +92 -0
  66. package/build/python/cleaner.py +226 -0
  67. package/build/python/config.py +263 -0
  68. package/build/python/dataworld_engine.py +208 -0
  69. package/build/python/export_engine.py +243 -0
  70. package/build/python/framework_adapters.py +100 -0
  71. package/build/python/fusion_engine.py +368 -0
  72. package/build/python/github_adapter.py +106 -0
  73. package/build/python/hf_fallback.py +298 -0
  74. package/build/python/image_engine.py +86 -0
  75. package/build/python/kaggle_engine.py +295 -0
  76. package/build/python/media_engine.py +133 -0
  77. package/build/python/nasa_adapter.py +82 -0
  78. package/build/python/openml_engine.py +146 -0
  79. package/build/python/quality_engine.py +267 -0
  80. package/build/python/row_count.py +54 -0
  81. package/build/python/splitter_engine.py +283 -0
  82. package/build/python/target_engine.py +154 -0
  83. package/build/python/test_framework_adapters.py +61 -0
  84. package/build/python/test_fusion_engine.py +89 -0
  85. package/build/python/uci_adapter.py +94 -0
  86. package/build/python/vesper/__init__.py +1 -0
  87. package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
  88. package/build/python/vesper/core/__init__.py +1 -0
  89. package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
  90. package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
  91. package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
  92. package/build/python/vesper/core/asset_downloader.py +675 -0
  93. package/build/python/vesper/core/download_recipe.py +104 -0
  94. package/build/python/worldbank_adapter.py +99 -0
  95. package/build/quality/analyzer.js +93 -0
  96. package/build/quality/image-analyzer.js +114 -0
  97. package/build/quality/media-analyzer.js +115 -0
  98. package/build/quality/quality-orchestrator.js +162 -0
  99. package/build/quality/types.js +1 -0
  100. package/build/scripts/build-index.js +54 -0
  101. package/build/scripts/check-db.js +73 -0
  102. package/build/scripts/check-jobs.js +24 -0
  103. package/build/scripts/check-naruto.js +17 -0
  104. package/build/scripts/cleanup-kaggle.js +41 -0
  105. package/build/scripts/demo-full-pipeline.js +62 -0
  106. package/build/scripts/demo-ui.js +58 -0
  107. package/build/scripts/e2e-demo.js +72 -0
  108. package/build/scripts/massive-scrape.js +103 -0
  109. package/build/scripts/ops-dashboard.js +33 -0
  110. package/build/scripts/repro-bug.js +37 -0
  111. package/build/scripts/repro-export-bug.js +56 -0
  112. package/build/scripts/scrape-metadata.js +100 -0
  113. package/build/scripts/search-cli.js +26 -0
  114. package/build/scripts/test-bias.js +45 -0
  115. package/build/scripts/test-caching.js +51 -0
  116. package/build/scripts/test-cleaning.js +76 -0
  117. package/build/scripts/test-cloud-storage.js +48 -0
  118. package/build/scripts/test-compliance.js +58 -0
  119. package/build/scripts/test-conversion.js +64 -0
  120. package/build/scripts/test-custom-rules.js +58 -0
  121. package/build/scripts/test-db-opt.js +63 -0
  122. package/build/scripts/test-export-custom.js +33 -0
  123. package/build/scripts/test-exporter.js +53 -0
  124. package/build/scripts/test-fusion.js +61 -0
  125. package/build/scripts/test-github.js +27 -0
  126. package/build/scripts/test-group-split.js +52 -0
  127. package/build/scripts/test-hf-download.js +29 -0
  128. package/build/scripts/test-holdout-manager.js +61 -0
  129. package/build/scripts/test-hybrid-search.js +41 -0
  130. package/build/scripts/test-image-analysis.js +50 -0
  131. package/build/scripts/test-ingestion-infra.js +39 -0
  132. package/build/scripts/test-install.js +40 -0
  133. package/build/scripts/test-institutional.js +26 -0
  134. package/build/scripts/test-integrity.js +41 -0
  135. package/build/scripts/test-jit.js +42 -0
  136. package/build/scripts/test-job-queue.js +62 -0
  137. package/build/scripts/test-kaggle-download.js +34 -0
  138. package/build/scripts/test-large-data.js +50 -0
  139. package/build/scripts/test-mcp-v5.js +74 -0
  140. package/build/scripts/test-media-analysis.js +61 -0
  141. package/build/scripts/test-monitoring.js +91 -0
  142. package/build/scripts/test-observability.js +106 -0
  143. package/build/scripts/test-packager.js +55 -0
  144. package/build/scripts/test-pipeline.js +50 -0
  145. package/build/scripts/test-planning.js +64 -0
  146. package/build/scripts/test-privacy.js +38 -0
  147. package/build/scripts/test-production-sync.js +36 -0
  148. package/build/scripts/test-quality.js +43 -0
  149. package/build/scripts/test-robust-ingestion.js +41 -0
  150. package/build/scripts/test-schema.js +45 -0
  151. package/build/scripts/test-split-validation.js +40 -0
  152. package/build/scripts/test-splitter.js +93 -0
  153. package/build/scripts/test-target-detector.js +29 -0
  154. package/build/scripts/test-uci.js +27 -0
  155. package/build/scripts/test-unified-quality.js +86 -0
  156. package/build/scripts/test-write.js +14 -0
  157. package/build/scripts/verify-integration.js +57 -0
  158. package/build/scripts/verify-priority.js +33 -0
  159. package/build/search/embedder.js +34 -0
  160. package/build/search/engine.js +152 -0
  161. package/build/search/jit-orchestrator.js +258 -0
  162. package/build/search/vector-store.js +123 -0
  163. package/build/splitting/splitter.js +82 -0
  164. package/build/splitting/types.js +1 -0
  165. package/build/tools/formatter.js +251 -0
  166. package/build/utils/downloader.js +52 -0
  167. package/build/utils/selector.js +69 -0
  168. package/mcp-config-template.json +18 -0
  169. package/package.json +101 -29
  170. package/scripts/postinstall.cjs +114 -0
  171. package/scripts/preindex_registry.cjs +157 -0
  172. package/scripts/refresh-index.cjs +87 -0
  173. package/{wizard.js → scripts/wizard.js} +148 -32
  174. package/src/python/__pycache__/config.cpython-312.pyc +0 -0
  175. package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
  176. package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
  177. package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
  178. package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
  179. package/src/python/asset_downloader_engine.py +92 -0
  180. package/src/python/cleaner.py +226 -0
  181. package/src/python/config.py +263 -0
  182. package/src/python/dataworld_engine.py +208 -0
  183. package/src/python/export_engine.py +243 -0
  184. package/src/python/framework_adapters.py +100 -0
  185. package/src/python/fusion_engine.py +368 -0
  186. package/src/python/github_adapter.py +106 -0
  187. package/src/python/hf_fallback.py +298 -0
  188. package/src/python/image_engine.py +86 -0
  189. package/src/python/kaggle_engine.py +295 -0
  190. package/src/python/media_engine.py +133 -0
  191. package/src/python/nasa_adapter.py +82 -0
  192. package/src/python/openml_engine.py +146 -0
  193. package/src/python/quality_engine.py +267 -0
  194. package/src/python/row_count.py +54 -0
  195. package/src/python/splitter_engine.py +283 -0
  196. package/src/python/target_engine.py +154 -0
  197. package/src/python/test_framework_adapters.py +61 -0
  198. package/src/python/test_fusion_engine.py +89 -0
  199. package/src/python/uci_adapter.py +94 -0
  200. package/src/python/vesper/__init__.py +1 -0
  201. package/src/python/vesper/core/__init__.py +1 -0
  202. package/src/python/vesper/core/asset_downloader.py +675 -0
  203. package/src/python/vesper/core/download_recipe.py +104 -0
  204. package/src/python/worldbank_adapter.py +99 -0
  205. package/vesper-mcp-config.json +0 -6
@@ -0,0 +1,157 @@
1
+ const fs = require('fs');
2
+ const path = require('path');
3
+ const os = require('os');
4
+
5
+ const { argv, cwd } = process;
6
+
7
/**
 * Print the command-line help text for the registry pre-indexing script.
 */
function usage() {
  // One array element per output line; the trailing empty element keeps the
  // final newline that the help text ends with.
  const helpLines = [
    'Usage: node scripts/preindex_registry.cjs [--scan dir1 dir2 ...] [--target N] [--out path] [--no-count]',
    '',
    'Options:',
    '--scan Directories to recursively scan for datasets (default: ./e2e_demo_output ./datasets)',
    '--target Target total registry entries (if larger than scanned, will synthesize entries)',
    '--out Output registry path (default: ~/.vesper/registry.json)',
    '--no-count Skip expensive row counting for CSV/JSONL',
    '',
  ];
  console.log(helpLines.join('\n'));
}
17
+
18
// Command-line state: filled in by the flag loop below and read by the rest
// of the script.
let scanDirs = []; // directories to scan (--scan)
let target = 0; // requested total number of registry entries (--target)
let outPath = path.join(os.homedir(), '.vesper', 'registry.json'); // registry file (--out)
let doCount = true; // cleared by --no-count

// argv[0] is the node binary and argv[1] the script path, so flags start at 2.
for (let i = 2; i < argv.length; i++) {
  const a = argv[i];
  if (a === '--scan') {
    // Consume every following argument up to the next flag.
    i++;
    while (i < argv.length && !argv[i].startsWith('--')) {
      scanDirs.push(argv[i]);
      i++;
    }
    // Step back so the loop increment lands on the flag that ended the list.
    i--;
  } else if (a === '--target') {
    // A missing or non-numeric value falls back to 0.
    target = parseInt(argv[++i], 10) || 0;
  } else if (a === '--out') {
    const value = argv[++i];
    if (value === undefined) {
      // path.resolve(undefined) throws a TypeError; report a usage error instead.
      console.error('Missing value for', a);
      usage();
      process.exit(2);
    }
    outPath = path.resolve(value);
  } else if (a === '--no-count') {
    doCount = false;
  } else if (a === '--help' || a === '-h') {
    usage();
    process.exit(0);
  } else {
    console.error('Unknown arg', a);
    usage();
    process.exit(2);
  }
}

// Default scan locations, relative to the current working directory.
if (scanDirs.length === 0) scanDirs = [path.join(cwd(), 'e2e_demo_output'), path.join(cwd(), 'datasets')];
49
+
50
/**
 * Convert an arbitrary string into a lowercase identifier.
 *
 * Every run of characters other than ASCII letters and digits becomes a single
 * underscore; leading and trailing underscores are then removed.
 *
 * @param {string} s Text to normalize.
 * @returns {string} The normalized identifier.
 */
function normalizeId(s) {
  const underscored = s.replace(/[^a-z0-9]+/gi, '_');
  const trimmed = underscored.replace(/^_+|_+$/g, '');
  return trimmed.toLowerCase();
}
53
+
54
/**
 * Recursively collect files under `dir` whose extension is listed in `exts`.
 *
 * The scan is best-effort: a directory that cannot be read contributes no
 * entries instead of raising. Only entries reported as directories or regular
 * files are considered.
 *
 * @param {string} dir Directory to scan.
 * @param {string[]} [exts] Lowercase extensions (with leading dot) to keep.
 * @returns {string[]} Paths of the matching files.
 */
function walk(dir, exts = ['.csv', '.jsonl', '.json', '.arrow', '.parquet', '.feather']) {
  const results = [];
  let items;
  try {
    items = fs.readdirSync(dir, { withFileTypes: true });
  } catch (e) {
    // Missing or unreadable directory: skip it, keep scanning the rest.
    return results;
  }
  for (const it of items) {
    const p = path.join(dir, it.name);
    if (it.isDirectory()) {
      // Append one by one: push(...array) passes every element as a call
      // argument and throws RangeError on very large subtrees.
      for (const nested of walk(p, exts)) results.push(nested);
    } else if (it.isFile()) {
      const ext = path.extname(it.name).toLowerCase();
      if (exts.includes(ext)) results.push(p);
    }
  }
  return results;
}
71
+
72
/**
 * Count the lines of a text file by streaming it.
 *
 * Every line is counted, including a header line if the file has one. A final
 * line that is not terminated by a newline is counted as well, so a file
 * yields the same number with or without a trailing newline.
 *
 * @param {string} filePath File to read (decoded as UTF-8).
 * @returns {Promise<number>} Resolves with the line count; rejects on a stream error.
 */
function countCsvRows(filePath) {
  return new Promise((resolve, reject) => {
    let count = 0;
    // True while the data seen so far is empty or ends with '\n'.
    let endsWithNewline = true;
    const rs = fs.createReadStream(filePath, { encoding: 'utf8' });
    rs.on('data', chunk => {
      if (chunk.length === 0) return;
      let pos = chunk.indexOf('\n');
      while (pos !== -1) {
        count++;
        pos = chunk.indexOf('\n', pos + 1);
      }
      endsWithNewline = chunk[chunk.length - 1] === '\n';
    });
    // An unterminated last line has no '\n' of its own; add it here.
    rs.on('end', () => resolve(endsWithNewline ? count : count + 1));
    rs.on('error', reject);
  });
}
83
+
84
/**
 * Build or refresh the registry file at `outPath`.
 *
 * Steps:
 *   1. Load the existing registry, if it is present and holds a JSON array.
 *   2. Scan every directory in `scanDirs` and add or replace one entry per
 *      dataset file found, keyed by an id derived from its relative path.
 *   3. If `target` exceeds the number of entries, add placeholder
 *      ("synthesized") entries until the registry holds `target` entries.
 *   4. Write the registry as pretty-printed JSON and print a summary.
 */
(async function main() {
  const registryDir = path.dirname(outPath);
  if (!fs.existsSync(registryDir)) fs.mkdirSync(registryDir, { recursive: true });

  let existing = [];
  if (fs.existsSync(outPath)) {
    try {
      const parsed = JSON.parse(fs.readFileSync(outPath, 'utf8'));
      // Anything other than an array cannot be iterated as entries; start fresh.
      if (Array.isArray(parsed)) existing = parsed;
    } catch (e) {
      existing = [];
    }
  }
  const map = new Map();
  for (const e of existing) {
    if (e && typeof e === 'object') map.set(e.normalized_id || e.id, e);
  }

  let scanned = 0;
  for (const dir of scanDirs) {
    const abs = path.resolve(dir);
    const files = walk(abs);
    for (const f of files) {
      let stats;
      try {
        stats = fs.statSync(f);
      } catch (e) {
        // The file disappeared between the directory scan and now; skip it.
        continue;
      }
      const base = path.basename(f, path.extname(f));
      const rel = path.relative(process.cwd(), f);
      const id = normalizeId(rel || base);
      // walk() matches extensions case-insensitively, so do the same here.
      const ext = path.extname(f).toLowerCase();
      let cols = null;
      let rows = null;
      if (doCount && (ext === '.csv' || ext === '.jsonl')) {
        try {
          if (ext === '.csv') {
            // Column count is taken from a plain comma split of the first line.
            const header = fs.readFileSync(f, { encoding: 'utf8', flag: 'r' }).split(/\r?\n/, 1)[0] || '';
            cols = header ? header.split(',').length : 0;
          }
          rows = await countCsvRows(f);
        } catch (e) {
          // Counting is best-effort; leave rows/cols as they are on failure.
        }
      }
      const entry = {
        id: id,
        normalized_id: id,
        source: 'scanned',
        path: f,
        size: stats.size,
        mtime: stats.mtime.toISOString(),
        meta: { rows, cols }
      };
      map.set(id, entry);
      scanned++;
    }
  }

  // Synthesize if target requested
  let synthesized = 0;
  if (target > map.size) {
    const synthDir = path.join(path.dirname(outPath), 'local_library');
    if (!fs.existsSync(synthDir)) fs.mkdirSync(synthDir, { recursive: true });
    // Number from the current size upward, skipping ids that already exist
    // (for example from an earlier run) so the target is actually reached.
    let idx = map.size;
    while (map.size < target) {
      idx++;
      const id = `synth_${String(idx).padStart(6, '0')}`;
      if (map.has(id)) continue;
      const entry = {
        id,
        normalized_id: id,
        source: 'synthesized',
        path: path.join(synthDir, `${id}.csv`),
        size: 0,
        mtime: new Date().toISOString(),
        meta: { rows: Math.floor(Math.random() * 1000000), cols: Math.floor(Math.random() * 200) + 1 }
      };
      map.set(id, entry);
      synthesized++;
    }
  }

  const outArr = Array.from(map.values());
  fs.writeFileSync(outPath, JSON.stringify(outArr, null, 2), 'utf8');
  console.log(`Wrote ${outArr.length} registry entries to ${outPath} (${scanned} scanned, ${synthesized} synthesized)`);
})().catch((err) => {
  // Report the failure and exit non-zero instead of leaving an unhandled rejection.
  console.error('preindex_registry failed:', err && err.message ? err.message : err);
  process.exit(1);
});
@@ -0,0 +1,87 @@
1
+ #!/usr/bin/env node
2
+
3
+ const { spawnSync } = require("child_process");
4
+ const fs = require("fs");
5
+ const path = require("path");
6
+ const os = require("os");
7
+ const Database = require("better-sqlite3");
8
+
9
/**
 * Run a command synchronously and throw unless it exits with status 0.
 *
 * Output is inherited from this process by default, and a shell is used on
 * Windows; both can be overridden through `options`.
 *
 * @param {string} command Executable to run.
 * @param {string[]} args Arguments passed to the executable.
 * @param {object} [options] Extra `spawnSync` options; they override the defaults.
 * @throws {Error} If the process cannot be started, is terminated by a
 *   signal, or exits with a non-zero status.
 */
function runCommand(command, args, options = {}) {
  const result = spawnSync(command, args, {
    stdio: "inherit",
    shell: process.platform === "win32",
    ...options,
  });

  const invocation = `${command} ${args.join(" ")}`;

  // When the child could not be spawned, status is null and the cause is in
  // result.error; surface it instead of reporting "exit null".
  if (result.error) {
    throw new Error(`Command failed: ${invocation} (${result.error.message})`);
  }

  if (result.status !== 0) {
    const reason = result.signal ? `signal ${result.signal}` : `exit ${result.status}`;
    throw new Error(`Command failed: ${invocation} (${reason})`);
  }
}
20
+
21
/**
 * Return the number of rows in the `datasets` table of a SQLite database.
 *
 * @param {string} dbPath Path to the database file.
 * @returns {number|string} The row count, or "N/A" when the file does not exist.
 */
function countDatasets(dbPath) {
  if (!fs.existsSync(dbPath)) return "N/A";
  const db = new Database(dbPath);
  try {
    return db.prepare("SELECT COUNT(*) AS c FROM datasets").get().c;
  } finally {
    // Close the handle even when the query throws (for example, missing table).
    db.close();
  }
}
28
+
29
/**
 * Read a vector-index JSON file and report how many vectors it describes.
 *
 * A numeric `count` property is preferred; otherwise the length of an `ids`
 * array is used.
 *
 * @param {string} jsonPath Path to the JSON file.
 * @returns {number|string} The vector count, or "N/A" when the file is missing
 *   or has neither property.
 */
function countVectors(jsonPath) {
  if (fs.existsSync(jsonPath)) {
    const parsed = JSON.parse(fs.readFileSync(jsonPath, "utf8"));
    if (typeof parsed.count === "number") {
      return parsed.count;
    }
    return Array.isArray(parsed.ids) ? parsed.ids.length : "N/A";
  }
  return "N/A";
}
36
+
37
/**
 * Copy the workspace index files into the per-user runtime directory
 * (`~/.vesper/data`).
 *
 * All source files are checked before anything is copied, so a missing file
 * cannot leave the runtime directory with only part of the index replaced.
 *
 * @param {string} workspaceRoot Directory containing the `data` folder to copy from.
 * @returns {string} The runtime directory that was written to.
 * @throws {Error} If any of the source files does not exist.
 */
function syncRuntime(workspaceRoot) {
  const runtimeDir = path.join(os.homedir(), ".vesper", "data");

  const files = ["metadata.db", "vectors.json", "vectors.bin"];
  const transfers = files.map((file) => ({
    src: path.join(workspaceRoot, "data", file),
    dest: path.join(runtimeDir, file),
  }));

  // Validate first: report the first missing source without touching the runtime copy.
  const missing = transfers.find(({ src }) => !fs.existsSync(src));
  if (missing) {
    throw new Error(`Missing source file: ${missing.src}`);
  }

  fs.mkdirSync(runtimeDir, { recursive: true });
  for (const { src, dest } of transfers) {
    fs.copyFileSync(src, dest);
  }

  return runtimeDir;
}
53
+
54
/**
 * Refresh the search index end to end: run the `massive-scrape` npm script,
 * run the `index` npm script with a raised heap limit, copy the resulting
 * files to the runtime directory, then print dataset and vector counts for
 * both locations.
 */
function main() {
  const workspaceRoot = process.cwd();
  const workspaceData = path.join(workspaceRoot, "data");
  const runtimeData = path.join(os.homedir(), ".vesper", "data");

  console.log("\n[refresh-index] Step 1/3: Massive scrape...");
  runCommand("npm", ["run", "massive-scrape"]);

  console.log("\n[refresh-index] Step 2/3: High-memory indexing...");
  // Pass the larger heap limit to the indexing child through NODE_OPTIONS.
  const indexEnv = Object.assign({}, process.env, { NODE_OPTIONS: "--max-old-space-size=8192" });
  runCommand("npm", ["run", "index"], { env: indexEnv });

  console.log("\n[refresh-index] Step 3/3: Sync workspace index to runtime...");
  const runtimeDir = syncRuntime(workspaceRoot);

  // Count in the same order as before: workspace first, then runtime.
  const wsDb = countDatasets(path.join(workspaceData, "metadata.db"));
  const wsVec = countVectors(path.join(workspaceData, "vectors.json"));
  const rtDb = countDatasets(path.join(runtimeData, "metadata.db"));
  const rtVec = countVectors(path.join(runtimeData, "vectors.json"));

  console.log("\n[refresh-index] Completed successfully.");
  console.log(`[refresh-index] Workspace: DB=${wsDb}, VECTORS=${wsVec}`);
  console.log(`[refresh-index] Runtime: DB=${rtDb}, VECTORS=${rtVec}`);
  console.log(`[refresh-index] Runtime path: ${runtimeDir}\n`);
}

// Script entry point: any failure is reported on stderr with exit status 1.
try {
  main();
} catch (error) {
  console.error("\n[refresh-index] Failed:", error.message);
  process.exit(1);
}
@@ -91,12 +91,34 @@ function httpJson(method, url, body) {
91
91
  });
92
92
  }
93
93
 
94
- async function canReachDeviceAuth(baseUrl) {
94
+ async function probeDeviceAuth(baseUrl) {
95
95
  try {
96
96
  const res = await httpJson('POST', `${baseUrl}/api/auth/device/start`);
97
- return res.status === 201 && !!res.body && !!res.body.code;
98
- } catch {
99
- return false;
97
+ if (res.status === 201 && !!res.body && !!res.body.code) {
98
+ return { baseUrl, status: 'ready', response: res.body };
99
+ }
100
+
101
+ if (res.status === 503 && res.body && res.body.requiresSetup) {
102
+ return {
103
+ baseUrl,
104
+ status: 'setup-required',
105
+ response: res.body,
106
+ message: res.body.error || 'Auth storage is not initialized.',
107
+ };
108
+ }
109
+
110
+ return {
111
+ baseUrl,
112
+ status: 'unreachable',
113
+ response: res.body,
114
+ message: typeof res.body === 'string' ? res.body : JSON.stringify(res.body),
115
+ };
116
+ } catch (error) {
117
+ return {
118
+ baseUrl,
119
+ status: 'unreachable',
120
+ message: error && error.message ? error.message : 'Request failed',
121
+ };
100
122
  }
101
123
  }
102
124
 
@@ -105,13 +127,20 @@ async function resolveVesperApiBaseUrl() {
105
127
  ? [VESPER_API_URL]
106
128
  : DEFAULT_VESPER_API_CANDIDATES;
107
129
 
130
+ let setupRequiredProbe = null;
131
+
108
132
  for (const candidate of candidates) {
109
- if (await canReachDeviceAuth(candidate)) {
110
- return candidate;
133
+ const probe = await probeDeviceAuth(candidate);
134
+ if (probe.status === 'ready') {
135
+ return probe;
136
+ }
137
+
138
+ if (!setupRequiredProbe && probe.status === 'setup-required') {
139
+ setupRequiredProbe = probe;
111
140
  }
112
141
  }
113
142
 
114
- return null;
143
+ return setupRequiredProbe;
115
144
  }
116
145
 
117
146
  function openBrowser(url) {
@@ -136,6 +165,72 @@ function askYesNo(question) {
136
165
  });
137
166
  }
138
167
 
168
/**
 * Ask a single question on the terminal and resolve with the trimmed answer.
 *
 * @param {string} question Prompt text.
 * @returns {Promise<string>} The answer with surrounding whitespace removed;
 *   an empty string when nothing was entered.
 */
function askInput(question) {
  return new Promise((resolve) => {
    const terminal = readline.createInterface({ input: process.stdin, output: process.stdout });
    const onAnswer = (answer) => {
      // Close the interface before resolving, as each call opens its own.
      terminal.close();
      const text = String(answer || '');
      resolve(text.trim());
    };
    terminal.question(` ${question} `, onAnswer);
  });
}
177
+
178
/**
 * Show a numbered list of choices and return the value the user picks.
 *
 * The answer may be the 1-based number of a choice or the exact `value` of a
 * choice. An empty or unrecognized answer yields `defaultValue`.
 *
 * @param {string} question Heading printed above the list.
 * @param {{value: string, label: string}[]} choices Options to offer.
 * @param {string} [defaultValue] Value returned when no valid choice is made.
 * @returns {Promise<string|undefined>} The selected value, or `defaultValue`.
 */
async function askChoice(question, choices, defaultValue) {
  console.log(` ${question}`);
  choices.forEach((choice, index) => {
    console.log(` ${dim(String(index + 1) + ')')} ${choice.label}`);
  });

  const prompt = defaultValue ? `${dim('[default: ' + defaultValue + ']')}` : '';
  const answer = await askInput(`${prompt} ${cyan('→')} Choose an option:`);
  if (!answer && defaultValue) {
    return defaultValue;
  }

  // Accept whole numbers only: a fractional answer such as "1.5" would pass a
  // plain range check and then index a non-existent element.
  const numeric = Number(answer);
  if (Number.isInteger(numeric) && numeric >= 1 && numeric <= choices.length) {
    return choices[numeric - 1].value;
  }

  const matched = choices.find((choice) => choice.value === answer);
  return matched ? matched.value : defaultValue;
}
198
+
199
/**
 * Tell whether a key has the `vesper_sk_` prefix without being a local-only
 * key (prefix `vesper_sk_local_`).
 *
 * @param {string} value Key to inspect; empty or missing values are rejected.
 * @returns {boolean} True for a key with the cloud prefix, false otherwise.
 */
function isCloudApiKey(value) {
  if (!value) {
    return false;
  }
  if (!value.startsWith('vesper_sk_')) {
    return false;
  }
  return !value.startsWith('vesper_sk_local_');
}
202
+
203
/**
 * Prompt for a Vesper cloud API key, repeating the prompt until the entered
 * value passes `isCloudApiKey`.
 *
 * @returns {Promise<string>} The accepted key.
 */
async function promptForManualApiKey() {
  console.log(`\n ${cyan('■')} ${bold('Manual API Key')}`);
  console.log(` ${dim('Paste a Vesper cloud API key. It will be stored locally in config.toml.\n')}`);

  // Keep asking; a warning is printed after every rejected entry.
  for (;;) {
    const entered = await askInput(`${cyan('→')} Vesper API key:`);
    if (!isCloudApiKey(entered)) {
      console.log(` ${yellow('!')} ${yellow('Expected a Vesper key starting with vesper_sk_')}`);
      continue;
    }
    return entered;
  }
}
215
+
216
/**
 * Ask how the user wants to authenticate.
 *
 * When a key is already configured, its first 24 characters and its mode are
 * printed, and "keep" is offered as the first option and the default.
 * Without an existing key the default is "browser".
 *
 * @param {string} existingKey Currently configured key, or an empty value.
 * @param {string} existingAuthMode Currently configured mode, or an empty value.
 * @returns {Promise<string>} One of 'keep', 'manual', 'browser' or 'local'.
 */
async function chooseAuthMode(existingKey, existingAuthMode) {
  const hasExistingKey = Boolean(existingKey);

  const choices = [
    { value: 'manual', label: 'Provide Vesper API key manually' },
    { value: 'browser', label: 'Sign in through the browser' },
    { value: 'local', label: 'Use local-only key' },
  ];

  if (hasExistingKey) {
    console.log(` ${dim('Current key:')} ${dim(existingKey.slice(0, 24) + '...')}`);
    // Fall back to a mode inferred from the key prefix when none is stored.
    let shownMode = existingAuthMode;
    if (!shownMode) {
      shownMode = isCloudApiKey(existingKey) ? 'cloud' : 'local_unified';
    }
    console.log(` ${dim('Current mode:')} ${dim(shownMode)}`);
    choices.unshift({ value: 'keep', label: 'Keep current key as-is' });
  }

  const fallback = hasExistingKey ? 'keep' : 'browser';
  return await askChoice(`${cyan('→')} How do you want to authenticate Vesper?`, choices, fallback);
}
233
+
139
234
  async function deviceAuthFlow() {
140
235
  console.log(`\n ${cyan('■')} ${bold('Device Authentication')}`);
141
236
  console.log(` ${dim('Link your CLI to a Vesper account for cloud features\n')}`);
@@ -149,16 +244,25 @@ async function deviceAuthFlow() {
149
244
  return null;
150
245
  }
151
246
 
152
- console.log(` ${dim('Auth endpoint:')} ${dim(resolvedApiBaseUrl)}\n`);
247
+ if (resolvedApiBaseUrl.status === 'setup-required') {
248
+ console.log(` ${yellow('!')} ${yellow('Reached Vesper auth endpoint, but local auth storage is not initialized.')}`);
249
+ console.log(` ${dim('Endpoint:')} ${dim(resolvedApiBaseUrl.baseUrl)}`);
250
+ console.log(` ${dim('Reason:')} ${dim(resolvedApiBaseUrl.message || 'Apply Supabase migrations first.')}`);
251
+ console.log(` ${dim('Run the SQL in supabase/migrations/001_device_auth.sql and 002_rate_limits.sql, then retry.')}`);
252
+ console.log(` ${dim('Falling back to local-only mode.\n')}`);
253
+ return null;
254
+ }
255
+
256
+ console.log(` ${dim('Auth endpoint:')} ${dim(resolvedApiBaseUrl.baseUrl)}\n`);
153
257
 
154
258
  // Step 1: Call /api/auth/device/start
155
259
  process.stdout.write(` ${dim('Requesting device code...')}`);
156
260
  let startRes;
157
261
  try {
158
- startRes = await httpJson('POST', `${resolvedApiBaseUrl}/api/auth/device/start`);
262
+ startRes = await httpJson('POST', `${resolvedApiBaseUrl.baseUrl}/api/auth/device/start`);
159
263
  } catch (err) {
160
264
  console.log(` ${red('✗')}`);
161
- console.log(` ${red('Could not reach Vesper API at')} ${dim(resolvedApiBaseUrl)}`);
265
+ console.log(` ${red('Could not reach Vesper API at')} ${dim(resolvedApiBaseUrl.baseUrl)}`);
162
266
  console.log(` ${dim('Falling back to local-only mode.\n')}`);
163
267
  return null;
164
268
  }
@@ -198,7 +302,7 @@ async function deviceAuthFlow() {
198
302
  process.stdout.write(`\r ${cyan(frame)} Polling... (${polls})`);
199
303
 
200
304
  try {
201
- const pollRes = await httpJson('GET', `${resolvedApiBaseUrl}/api/auth/device/poll?code=${code}`);
305
+ const pollRes = await httpJson('GET', `${resolvedApiBaseUrl.baseUrl}/api/auth/device/poll?code=${code}`);
202
306
 
203
307
  if (pollRes.body.status === 'confirmed' && pollRes.body.apiKey) {
204
308
  process.stdout.write(`\r ${green('✓')} Device authenticated! \n`);
@@ -357,36 +461,48 @@ async function main() {
357
461
  let localKey = existing.api_key || '';
358
462
  let authMode = existing.auth_mode || '';
359
463
 
360
- // If already has a cloud key, skip
361
- if (localKey && !localKey.startsWith('vesper_sk_local_')) {
362
- console.log(` ${green('')} Cloud API key already configured`);
363
- console.log(` ${dim('Key:')} ${dim(localKey.slice(0, 24) + '...')} ${dim('→')} ${dim(CONFIG_TOML)}`);
364
- } else {
365
- // Offer device auth
366
- const wantsDevice = await askYesNo(`${cyan('')} Link to a Vesper account? (enables cloud sync & team features)`);
464
+ const authChoice = await chooseAuthMode(localKey, authMode);
465
+
466
+ if (authChoice === 'keep' && localKey) {
467
+ console.log(` ${green('')} Keeping current key`);
468
+ } else if (authChoice === 'manual') {
469
+ localKey = await promptForManualApiKey();
470
+ authMode = 'cloud';
471
+ console.log(` ${green('✓')} Cloud API key saved from manual input`);
472
+ } else if (authChoice === 'browser') {
473
+ const cloudKey = await deviceAuthFlow();
474
+ if (cloudKey) {
475
+ localKey = cloudKey;
476
+ authMode = 'cloud';
477
+ } else {
478
+ const fallbackChoice = await askChoice(`${yellow('!')} Browser sign-in did not complete. Choose a fallback:`, [
479
+ { value: 'manual', label: 'Provide Vesper API key manually' },
480
+ { value: 'local', label: 'Use local-only key' },
481
+ ], 'manual');
367
482
 
368
- if (wantsDevice) {
369
- const cloudKey = await deviceAuthFlow();
370
- if (cloudKey) {
371
- localKey = cloudKey;
483
+ if (fallbackChoice === 'manual') {
484
+ localKey = await promptForManualApiKey();
372
485
  authMode = 'cloud';
373
486
  } else {
374
- // Fall back to local key
375
- if (!localKey) localKey = generateLocalKey();
487
+ if (!localKey || isCloudApiKey(localKey)) {
488
+ localKey = generateLocalKey();
489
+ }
376
490
  authMode = 'local_unified';
377
491
  console.log(`\n ${yellow('⚠')} Using local-only key. Run the wizard again anytime to link an account.`);
378
492
  }
379
- } else {
380
- if (!localKey) localKey = generateLocalKey();
381
- authMode = 'local_unified';
382
- console.log(` ${green('✓')} Local-only key generated`);
383
493
  }
384
-
385
- const configData = { ...existing, api_key: localKey, auth_mode: authMode };
386
- writeToml(CONFIG_TOML, configData);
387
- console.log(` ${dim('Key:')} ${dim(localKey.slice(0, 24) + '...')} ${dim('→')} ${dim(CONFIG_TOML)}`);
494
+ } else {
495
+ if (!localKey || isCloudApiKey(localKey)) {
496
+ localKey = generateLocalKey();
497
+ }
498
+ authMode = 'local_unified';
499
+ console.log(` ${green('✓')} Local-only key ready`);
388
500
  }
389
501
 
502
+ const configData = { ...existing, api_key: localKey, auth_mode: authMode };
503
+ writeToml(CONFIG_TOML, configData);
504
+ console.log(` ${dim('Key:')} ${dim(localKey.slice(0, 24) + '...')} ${dim('→')} ${dim(CONFIG_TOML)}`);
505
+
390
506
  // ─── Step 3: Local vault initialization ────────────────────
391
507
  process.stdout.write(`\n ${dim('[')}${cyan('3/6')}${dim(']')} Initializing local credentials vault...`);
392
508
  const vaultData = readToml(CONFIG_TOML);
@@ -0,0 +1,92 @@
1
+ import argparse
2
+ import asyncio
3
+ import json
4
+ import os
5
+ import sys
6
+ import warnings
7
+ from pathlib import Path
8
+ from typing import Any, Dict
9
+
10
+ # Suppress noisy HF warnings
11
+ warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
12
+ warnings.filterwarnings("ignore", message=".*legacy.*")
13
+
14
+ CURRENT_DIR = Path(__file__).resolve().parent
15
+ if str(CURRENT_DIR) not in sys.path:
16
+ sys.path.insert(0, str(CURRENT_DIR))
17
+
18
+ from vesper.core.asset_downloader import AssetDownloader
19
+ from vesper.core.download_recipe import build_download_recipe, save_recipe, get_download_recipe
20
+
21
+
22
def _print(payload: Dict[str, Any]) -> None:
    """Write *payload* to standard output as a single line of JSON.

    Non-ASCII characters are emitted as-is rather than as ``\\u`` escapes.
    """
    encoded = json.dumps(payload, ensure_ascii=False)
    print(encoded)
24
+
25
+
26
async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
    """Run the ``download`` action described by the JSON in ``args.payload``.

    Payload keys read here: ``dataset_id`` (required), ``output_root``,
    ``workers``, ``recipes_dir``, ``token`` / ``hf_token``, ``source``,
    ``repo_id``, ``kaggle_ref``, ``urls``, ``output_format``, ``max_items``
    and ``image_column``.

    Args:
        args: Parsed command-line arguments; ``args.payload`` is a JSON string.

    Returns:
        ``{"ok": True, "result": ...}`` wrapping the downloader's result.

    Raises:
        ValueError: If the payload is not a JSON object or has no
            ``dataset_id`` (``json.JSONDecodeError`` for malformed JSON is a
            subclass of ``ValueError``).
    """
    payload = json.loads(args.payload)
    if not isinstance(payload, dict):
        raise ValueError("payload must be a JSON object")

    # Reject a missing id up front: str(None) would otherwise hand the
    # literal text "None" to the downloader as the dataset id.
    dataset_id = payload.get("dataset_id")
    if dataset_id is None or not str(dataset_id).strip():
        raise ValueError("dataset_id is required")

    output_root = payload.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
    workers = int(payload.get("workers") or 8)
    recipes_dir = payload.get("recipes_dir")

    # Auto-set HF token from payload if provided
    token = payload.get("token") or payload.get("hf_token")
    if token:
        os.environ["HF_TOKEN"] = str(token)

    downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)

    result = await downloader.download_assets(
        dataset_id=str(dataset_id),
        source=payload.get("source"),
        repo_id=payload.get("repo_id"),
        kaggle_ref=payload.get("kaggle_ref"),
        urls=payload.get("urls"),
        output_format=payload.get("output_format", "webdataset"),
        max_items=payload.get("max_items"),
        image_column=payload.get("image_column"),
    )
    return {"ok": True, "result": result}
50
+
51
+
52
def main() -> None:
    """Command-line entry point for the asset downloader engine.

    Expects two positional arguments: an action (``download``,
    ``build_recipe`` or ``get_recipe``) and a JSON payload. The outcome is
    printed as one JSON object: ``{"ok": True, ...}`` on success, or
    ``{"ok": False, "error": ...}`` when the action raises. Exceptions raised
    while handling the action are reported that way and are not re-raised.
    """
    parser = argparse.ArgumentParser(description="Vesper Smart Asset Downloader Engine")
    parser.add_argument("action", choices=["download", "build_recipe", "get_recipe"])
    parser.add_argument("payload", help="JSON payload")
    args = parser.parse_args()

    try:
        if args.action == "download":
            _print(asyncio.run(_run_download(args)))
        else:
            request = json.loads(args.payload)
            if args.action == "build_recipe":
                built = build_download_recipe(request)
                location = save_recipe(built, request.get("recipes_dir"))
                _print({"ok": True, "recipe": built, "saved_to": location})
            elif args.action == "get_recipe":
                wanted = str(request.get("dataset_id"))
                found = get_download_recipe(wanted, request.get("recipes_dir"))
                _print({"ok": True, "recipe": found})
            else:
                _print({"ok": False, "error": f"Unknown action: {args.action}"})
    except Exception as exc:
        message = str(exc)
        # Provide actionable error messages
        auth_markers = ("401", "403", "Unauthorized")
        if any(marker in message for marker in auth_markers):
            message = (
                "Authentication required. This dataset may be gated/private. "
                "Use configure_keys tool to set HF_TOKEN, then retry."
            )
        elif "No image column" in message:
            message += " Hint: specify image_column parameter with the name of the column containing images."
        _print({"ok": False, "error": message})
89
+
90
+
91
+ if __name__ == "__main__":
92
+ main()