vesper-wizard 2.0.4 → 2.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +300 -37
- package/build/cache/cdn.js +34 -0
- package/build/cache/service.js +63 -0
- package/build/cleaning/cleaner.js +81 -0
- package/build/cleaning/evaluator.js +89 -0
- package/build/cleaning/executor.js +62 -0
- package/build/cleaning/exporter.js +87 -0
- package/build/cleaning/planner.js +127 -0
- package/build/cleaning/rules.js +57 -0
- package/build/cleaning/types.js +1 -0
- package/build/cloud/adapters/local.js +37 -0
- package/build/cloud/adapters/s3.js +24 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +26 -0
- package/build/cloud/types.js +1 -0
- package/build/compliance/service.js +73 -0
- package/build/compliance/store.js +80 -0
- package/build/compliance/types.js +1 -0
- package/build/config/config-manager.js +221 -0
- package/build/config/secure-keys.js +51 -0
- package/build/config/user-config.js +48 -0
- package/build/data/processing-worker.js +23 -0
- package/build/data/streaming.js +38 -0
- package/build/data/worker-pool.js +39 -0
- package/build/export/exporter.js +69 -0
- package/build/export/packager.js +100 -0
- package/build/export/types.js +1 -0
- package/build/fusion/aligner.js +56 -0
- package/build/fusion/deduplicator.js +69 -0
- package/build/fusion/engine.js +69 -0
- package/build/fusion/harmonizer.js +39 -0
- package/build/fusion/orchestrator.js +86 -0
- package/build/fusion/types.js +1 -0
- package/build/gateway/unified-dataset-gateway.js +409 -0
- package/build/index.js +2704 -0
- package/build/ingestion/hf-downloader.js +171 -0
- package/build/ingestion/ingestor.js +271 -0
- package/build/ingestion/kaggle-downloader.js +102 -0
- package/build/install/install-service.js +41 -0
- package/build/jobs/manager.js +136 -0
- package/build/jobs/queue.js +59 -0
- package/build/jobs/types.js +1 -0
- package/build/lib/supabase.js +3 -0
- package/build/metadata/dataworld-source.js +89 -0
- package/build/metadata/domain.js +147 -0
- package/build/metadata/github-scraper.js +47 -0
- package/build/metadata/institutional-scrapers.js +49 -0
- package/build/metadata/kaggle-scraper.js +182 -0
- package/build/metadata/kaggle-source.js +70 -0
- package/build/metadata/license.js +68 -0
- package/build/metadata/monitoring-service.js +107 -0
- package/build/metadata/monitoring-store.js +78 -0
- package/build/metadata/monitoring-types.js +1 -0
- package/build/metadata/openml-source.js +87 -0
- package/build/metadata/quality.js +48 -0
- package/build/metadata/rate-limiter.js +128 -0
- package/build/metadata/scraper.js +377 -0
- package/build/metadata/store.js +340 -0
- package/build/metadata/types.js +1 -0
- package/build/metadata/uci-scraper.js +49 -0
- package/build/monitoring/observability.js +76 -0
- package/build/preparation/target-detector.js +75 -0
- package/build/python/__pycache__/config.cpython-312.pyc +0 -0
- package/build/python/asset_downloader_engine.py +92 -0
- package/build/python/cleaner.py +226 -0
- package/build/python/config.py +263 -0
- package/build/python/dataworld_engine.py +208 -0
- package/build/python/export_engine.py +243 -0
- package/build/python/framework_adapters.py +100 -0
- package/build/python/fusion_engine.py +368 -0
- package/build/python/github_adapter.py +106 -0
- package/build/python/hf_fallback.py +298 -0
- package/build/python/image_engine.py +86 -0
- package/build/python/kaggle_engine.py +295 -0
- package/build/python/media_engine.py +133 -0
- package/build/python/nasa_adapter.py +82 -0
- package/build/python/openml_engine.py +146 -0
- package/build/python/quality_engine.py +267 -0
- package/build/python/row_count.py +54 -0
- package/build/python/splitter_engine.py +283 -0
- package/build/python/target_engine.py +154 -0
- package/build/python/test_framework_adapters.py +61 -0
- package/build/python/test_fusion_engine.py +89 -0
- package/build/python/uci_adapter.py +94 -0
- package/build/python/vesper/__init__.py +1 -0
- package/build/python/vesper/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__init__.py +1 -0
- package/build/python/vesper/core/__pycache__/__init__.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/asset_downloader.cpython-312.pyc +0 -0
- package/build/python/vesper/core/__pycache__/download_recipe.cpython-312.pyc +0 -0
- package/build/python/vesper/core/asset_downloader.py +675 -0
- package/build/python/vesper/core/download_recipe.py +104 -0
- package/build/python/worldbank_adapter.py +99 -0
- package/build/quality/analyzer.js +93 -0
- package/build/quality/image-analyzer.js +114 -0
- package/build/quality/media-analyzer.js +115 -0
- package/build/quality/quality-orchestrator.js +162 -0
- package/build/quality/types.js +1 -0
- package/build/scripts/build-index.js +54 -0
- package/build/scripts/check-db.js +73 -0
- package/build/scripts/check-jobs.js +24 -0
- package/build/scripts/check-naruto.js +17 -0
- package/build/scripts/cleanup-kaggle.js +41 -0
- package/build/scripts/demo-full-pipeline.js +62 -0
- package/build/scripts/demo-ui.js +58 -0
- package/build/scripts/e2e-demo.js +72 -0
- package/build/scripts/massive-scrape.js +103 -0
- package/build/scripts/ops-dashboard.js +33 -0
- package/build/scripts/repro-bug.js +37 -0
- package/build/scripts/repro-export-bug.js +56 -0
- package/build/scripts/scrape-metadata.js +100 -0
- package/build/scripts/search-cli.js +26 -0
- package/build/scripts/test-bias.js +45 -0
- package/build/scripts/test-caching.js +51 -0
- package/build/scripts/test-cleaning.js +76 -0
- package/build/scripts/test-cloud-storage.js +48 -0
- package/build/scripts/test-compliance.js +58 -0
- package/build/scripts/test-conversion.js +64 -0
- package/build/scripts/test-custom-rules.js +58 -0
- package/build/scripts/test-db-opt.js +63 -0
- package/build/scripts/test-export-custom.js +33 -0
- package/build/scripts/test-exporter.js +53 -0
- package/build/scripts/test-fusion.js +61 -0
- package/build/scripts/test-github.js +27 -0
- package/build/scripts/test-group-split.js +52 -0
- package/build/scripts/test-hf-download.js +29 -0
- package/build/scripts/test-holdout-manager.js +61 -0
- package/build/scripts/test-hybrid-search.js +41 -0
- package/build/scripts/test-image-analysis.js +50 -0
- package/build/scripts/test-ingestion-infra.js +39 -0
- package/build/scripts/test-install.js +40 -0
- package/build/scripts/test-institutional.js +26 -0
- package/build/scripts/test-integrity.js +41 -0
- package/build/scripts/test-jit.js +42 -0
- package/build/scripts/test-job-queue.js +62 -0
- package/build/scripts/test-kaggle-download.js +34 -0
- package/build/scripts/test-large-data.js +50 -0
- package/build/scripts/test-mcp-v5.js +74 -0
- package/build/scripts/test-media-analysis.js +61 -0
- package/build/scripts/test-monitoring.js +91 -0
- package/build/scripts/test-observability.js +106 -0
- package/build/scripts/test-packager.js +55 -0
- package/build/scripts/test-pipeline.js +50 -0
- package/build/scripts/test-planning.js +64 -0
- package/build/scripts/test-privacy.js +38 -0
- package/build/scripts/test-production-sync.js +36 -0
- package/build/scripts/test-quality.js +43 -0
- package/build/scripts/test-robust-ingestion.js +41 -0
- package/build/scripts/test-schema.js +45 -0
- package/build/scripts/test-split-validation.js +40 -0
- package/build/scripts/test-splitter.js +93 -0
- package/build/scripts/test-target-detector.js +29 -0
- package/build/scripts/test-uci.js +27 -0
- package/build/scripts/test-unified-quality.js +86 -0
- package/build/scripts/test-write.js +14 -0
- package/build/scripts/verify-integration.js +57 -0
- package/build/scripts/verify-priority.js +33 -0
- package/build/search/embedder.js +34 -0
- package/build/search/engine.js +152 -0
- package/build/search/jit-orchestrator.js +258 -0
- package/build/search/vector-store.js +123 -0
- package/build/splitting/splitter.js +82 -0
- package/build/splitting/types.js +1 -0
- package/build/tools/formatter.js +251 -0
- package/build/utils/downloader.js +52 -0
- package/build/utils/selector.js +69 -0
- package/mcp-config-template.json +18 -0
- package/package.json +101 -29
- package/scripts/postinstall.cjs +114 -0
- package/scripts/preindex_registry.cjs +157 -0
- package/scripts/refresh-index.cjs +87 -0
- package/{wizard.js → scripts/wizard.js} +148 -32
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/export_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/framework_adapters.cpython-312.pyc +0 -0
- package/src/python/__pycache__/fusion_engine.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +92 -0
- package/src/python/cleaner.py +226 -0
- package/src/python/config.py +263 -0
- package/src/python/dataworld_engine.py +208 -0
- package/src/python/export_engine.py +243 -0
- package/src/python/framework_adapters.py +100 -0
- package/src/python/fusion_engine.py +368 -0
- package/src/python/github_adapter.py +106 -0
- package/src/python/hf_fallback.py +298 -0
- package/src/python/image_engine.py +86 -0
- package/src/python/kaggle_engine.py +295 -0
- package/src/python/media_engine.py +133 -0
- package/src/python/nasa_adapter.py +82 -0
- package/src/python/openml_engine.py +146 -0
- package/src/python/quality_engine.py +267 -0
- package/src/python/row_count.py +54 -0
- package/src/python/splitter_engine.py +283 -0
- package/src/python/target_engine.py +154 -0
- package/src/python/test_framework_adapters.py +61 -0
- package/src/python/test_fusion_engine.py +89 -0
- package/src/python/uci_adapter.py +94 -0
- package/src/python/vesper/__init__.py +1 -0
- package/src/python/vesper/core/__init__.py +1 -0
- package/src/python/vesper/core/asset_downloader.py +675 -0
- package/src/python/vesper/core/download_recipe.py +104 -0
- package/src/python/worldbank_adapter.py +99 -0
- package/vesper-mcp-config.json +0 -6
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
const fs = require('fs');
|
|
2
|
+
const path = require('path');
|
|
3
|
+
const os = require('os');
|
|
4
|
+
|
|
5
|
+
const { argv, cwd } = process;
|
|
6
|
+
|
|
7
|
+
/**
 * Print command-line help for the registry pre-indexer to stdout.
 *
 * The option list covers every flag handled by this script's argument
 * loop, including the help flag itself.
 */
function usage() {
    console.log(`Usage: node scripts/preindex_registry.cjs [--scan dir1 dir2 ...] [--target N] [--out path] [--no-count]

Options:
  --scan       Directories to recursively scan for datasets (default: ./e2e_demo_output ./datasets)
  --target     Target total registry entries (if larger than scanned, will synthesize entries)
  --out        Output registry path (default: ~/.vesper/registry.json)
  --no-count   Skip expensive row counting for CSV/JSONL
  -h, --help   Show this help and exit
`);
}
|
|
17
|
+
|
|
18
|
+
// Defaults; each one may be overridden by the command-line flags parsed below.
let scanDirs = [];
let target = 0;
let outPath = path.join(os.homedir(), '.vesper', 'registry.json');
let doCount = true;

for (let i = 2; i < argv.length; i++) {
    const a = argv[i];
    if (a === '--scan') {
        // Consume every following argument up to (not including) the next "--" flag.
        while (i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
            scanDirs.push(argv[++i]);
        }
    } else if (a === '--target') {
        // A missing or non-numeric value falls back to 0 (no synthesis).
        target = parseInt(argv[++i], 10) || 0;
    } else if (a === '--out') {
        const value = argv[++i];
        // Without this check path.resolve(undefined) throws a raw TypeError.
        if (value === undefined || value.startsWith('--')) {
            console.error('Missing value for --out');
            usage();
            process.exit(2);
        }
        outPath = path.resolve(value);
    } else if (a === '--no-count') {
        doCount = false;
    } else if (a === '--help' || a === '-h') {
        usage();
        process.exit(0);
    } else {
        console.error('Unknown arg', a);
        usage();
        process.exit(2);
    }
}

// No --scan given: fall back to the two default directories under the current working directory.
if (scanDirs.length === 0) scanDirs = [path.join(cwd(), 'e2e_demo_output'), path.join(cwd(), 'datasets')];
|
|
49
|
+
|
|
50
|
+
/**
 * Turn an arbitrary string into a lowercase identifier.
 *
 * Every run of characters other than ASCII letters and digits becomes a
 * single underscore, then leading and trailing underscores are removed.
 *
 * @param {string} s Text to normalize (e.g. a relative file path).
 * @returns {string} The normalized identifier; may be empty.
 */
function normalizeId(s) {
    const collapsed = s.replace(/[^a-z0-9]+/gi, '_');
    const trimmed = collapsed.replace(/^_+|_+$/g, '');
    return trimmed.toLowerCase();
}
|
|
53
|
+
|
|
54
|
+
/**
 * Recursively collect files under `dir` whose extension is in `exts`.
 *
 * Extensions are compared case-insensitively. Directories that cannot be
 * read are skipped silently (best effort). Results are accumulated into a
 * single array rather than spread into `push(...)`, because spreading a
 * very large array can raise a RangeError, which the previous blanket
 * catch would have swallowed together with the whole subtree.
 *
 * @param {string} dir Directory to scan.
 * @param {string[]} [exts] Lowercase extensions (with leading dot) to keep.
 * @returns {string[]} Matching file paths, in directory-listing order.
 */
function walk(dir, exts = ['.csv', '.jsonl', '.json', '.arrow', '.parquet', '.feather']) {
    const results = [];

    const visit = (current) => {
        let items;
        try {
            items = fs.readdirSync(current, { withFileTypes: true });
        } catch (e) {
            // Unreadable or missing directory: skip it, keep scanning the rest.
            return;
        }
        for (const it of items) {
            const p = path.join(current, it.name);
            if (it.isDirectory()) {
                visit(p);
            } else if (it.isFile() && exts.includes(path.extname(it.name).toLowerCase())) {
                results.push(p);
            }
        }
    };

    visit(dir);
    return results;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Count the lines of a text file by streaming it.
 *
 * Every "\n" terminates a line; a final line with no trailing newline is
 * counted as well (previously it was dropped). An empty file yields 0.
 * No header handling is done here: for a CSV the header line is included
 * in the count.
 *
 * @param {string} filePath File to read as UTF-8.
 * @returns {Promise<number>} Resolves with the line count; rejects on a stream error.
 */
function countCsvRows(filePath) {
    return new Promise((resolve, reject) => {
        let count = 0;
        // Starts as "\n" so that a file with no data adds no extra line at the end.
        let lastChar = '\n';
        const rs = fs.createReadStream(filePath, { encoding: 'utf8' });
        rs.on('data', chunk => {
            let idx = chunk.indexOf('\n');
            while (idx !== -1) {
                count++;
                idx = chunk.indexOf('\n', idx + 1);
            }
            if (chunk.length > 0) lastChar = chunk[chunk.length - 1];
        });
        rs.on('end', () => resolve(lastChar === '\n' ? count : count + 1));
        rs.on('error', reject);
    });
}
|
|
83
|
+
|
|
84
|
+
/**
 * Build (or refresh) the registry JSON file at `outPath`.
 *
 * Steps:
 *   1. Load the existing registry, if any, keyed by normalized_id/id.
 *   2. Scan every directory in `scanDirs` and add or overwrite one entry per
 *      dataset file, optionally with row and column counts.
 *   3. If `target` exceeds the number of entries, add placeholder
 *      "synthesized" entries until the target is reached.
 *   4. Write the merged registry back as pretty-printed JSON.
 *
 * Any unexpected failure is reported on stderr and exits with code 1.
 */
(async function main() {
    const registryDir = path.dirname(outPath);
    fs.mkdirSync(registryDir, { recursive: true });

    // Tolerate a missing, corrupt, or non-array registry file by starting empty.
    let existing = [];
    if (fs.existsSync(outPath)) {
        try {
            const parsed = JSON.parse(fs.readFileSync(outPath, 'utf8'));
            if (Array.isArray(parsed)) existing = parsed;
        } catch (e) {
            existing = [];
        }
    }
    const map = new Map();
    for (const e of existing) {
        if (e && typeof e === 'object') map.set(e.normalized_id || e.id, e);
    }

    // Read only as much of the file as is needed to see the first line,
    // instead of loading a potentially huge CSV into memory.
    const readFirstLine = (filePath) => {
        const fd = fs.openSync(filePath, 'r');
        try {
            const chunk = Buffer.alloc(65536);
            const parts = [];
            let n;
            while ((n = fs.readSync(fd, chunk, 0, chunk.length, null)) > 0) {
                const view = chunk.subarray(0, n);
                const nl = view.indexOf(0x0a);
                if (nl !== -1) {
                    parts.push(Buffer.from(view.subarray(0, nl)));
                    break;
                }
                // Copy: `chunk` is reused by the next read.
                parts.push(Buffer.from(view));
            }
            return Buffer.concat(parts).toString('utf8').replace(/\r$/, '');
        } finally {
            fs.closeSync(fd);
        }
    };

    let scanned = 0;
    for (const dir of scanDirs) {
        const files = walk(path.resolve(dir));
        for (const f of files) {
            let stats;
            try {
                stats = fs.statSync(f);
            } catch (e) {
                // The file disappeared between listing and stat; skip it.
                continue;
            }
            const base = path.basename(f, path.extname(f));
            const rel = path.relative(process.cwd(), f);
            const id = normalizeId(rel || base);
            let cols = null;
            let rows = null;
            // Compare the extension case-insensitively, as the directory walk does.
            const ext = path.extname(f).toLowerCase();
            if (doCount && (ext === '.csv' || ext === '.jsonl')) {
                try {
                    if (ext === '.csv') {
                        const header = readFirstLine(f);
                        // Naive split: quoted commas in the header are not handled.
                        cols = header ? header.split(',').length : 0;
                    }
                    rows = await countCsvRows(f);
                } catch (e) {
                    // Counting is best effort; leave rows/cols as they are.
                }
            }
            const entry = {
                id: id,
                normalized_id: id,
                source: 'scanned',
                path: f,
                size: stats.size,
                mtime: stats.mtime.toISOString(),
                meta: { rows, cols }
            };
            map.set(id, entry);
            scanned++;
        }
    }

    // Synthesize if target requested
    let synthesized = 0;
    if (target > map.size) {
        const synthDir = path.join(registryDir, 'local_library');
        fs.mkdirSync(synthDir, { recursive: true });
        let idx = map.size;
        while (map.size < target) {
            idx++;
            const id = `synth_${String(idx).padStart(6, '0')}`;
            // An id left over from an earlier run must not be overwritten,
            // otherwise the registry would end up short of the target.
            if (map.has(id)) continue;
            const entry = {
                id,
                normalized_id: id,
                source: 'synthesized',
                path: path.join(synthDir, `${id}.csv`),
                size: 0,
                mtime: new Date().toISOString(),
                meta: { rows: Math.floor(Math.random() * 1000000), cols: Math.floor(Math.random() * 200) + 1 }
            };
            map.set(id, entry);
            synthesized++;
        }
    }

    const outArr = Array.from(map.values());
    fs.writeFileSync(outPath, JSON.stringify(outArr, null, 2), 'utf8');
    console.log(`Wrote ${outArr.length} registry entries to ${outPath} (${scanned} scanned, ${synthesized} synthesized)`);
})().catch((err) => {
    console.error('preindex_registry failed:', err && err.message ? err.message : err);
    process.exit(1);
});
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
const { spawnSync } = require("child_process");
|
|
4
|
+
const fs = require("fs");
|
|
5
|
+
const path = require("path");
|
|
6
|
+
const os = require("os");
|
|
7
|
+
const Database = require("better-sqlite3");
|
|
8
|
+
|
|
9
|
+
/**
 * Run a command synchronously and throw unless it exits with status 0.
 *
 * Output is inherited from this process, and a shell is used on Windows;
 * both can be overridden through `options`.
 *
 * @param {string} command Executable to run.
 * @param {string[]} args Arguments passed to the executable.
 * @param {object} [options] Extra spawnSync options, merged over the defaults.
 * @throws {Error} If the process could not be started, was killed by a
 *   signal, or exited with a non-zero status.
 */
function runCommand(command, args, options = {}) {
    const result = spawnSync(command, args, {
        stdio: "inherit",
        shell: process.platform === "win32",
        ...options,
    });
    const rendered = `${command} ${args.join(" ")}`;

    // spawnSync reports launch failures (e.g. ENOENT) via `error`, with a null status.
    if (result.error) {
        throw new Error(`Command failed to start: ${rendered} (${result.error.message})`);
    }

    if (result.status !== 0) {
        const reason = result.signal ? `signal ${result.signal}` : `exit ${result.status}`;
        throw new Error(`Command failed: ${rendered} (${reason})`);
    }
}
|
|
20
|
+
|
|
21
|
+
/**
 * Return the number of rows in the `datasets` table of a SQLite file.
 *
 * @param {string} dbPath Path to the database file.
 * @returns {number|string} The row count, or "N/A" when the file does not exist.
 */
function countDatasets(dbPath) {
    if (!fs.existsSync(dbPath)) return "N/A";
    const db = new Database(dbPath);
    try {
        return db.prepare("SELECT COUNT(*) AS c FROM datasets").get().c;
    } finally {
        // Release the handle even when the query throws (e.g. missing table).
        db.close();
    }
}
|
|
28
|
+
|
|
29
|
+
/**
 * Report how many vectors a vectors JSON file describes.
 *
 * Uses the numeric `count` field when present, otherwise the length of the
 * `ids` array. This is reporting only, so a file that is missing,
 * unreadable, malformed, or not a JSON object yields "N/A" instead of
 * throwing; a parse/read failure is also logged as a warning.
 *
 * @param {string} jsonPath Path to the JSON file.
 * @returns {number|string} The vector count, or "N/A" when it cannot be determined.
 */
function countVectors(jsonPath) {
    if (!fs.existsSync(jsonPath)) return "N/A";

    let data;
    try {
        data = JSON.parse(fs.readFileSync(jsonPath, "utf8"));
    } catch (error) {
        console.warn(`[refresh-index] Could not read ${jsonPath}: ${error.message}`);
        return "N/A";
    }

    // JSON.parse may legitimately return null or a primitive.
    if (data === null || typeof data !== "object") return "N/A";
    if (typeof data.count === "number") return data.count;
    if (Array.isArray(data.ids)) return data.ids.length;
    return "N/A";
}
|
|
36
|
+
|
|
37
|
+
/**
 * Copy the workspace index files into the per-user runtime directory.
 *
 * The files metadata.db, vectors.json and vectors.bin are copied from
 * `<workspaceRoot>/data` to `~/.vesper/data`. All sources are checked
 * before anything is copied, so a missing file cannot leave the runtime
 * directory with a partially updated index.
 *
 * @param {string} workspaceRoot Directory that contains the `data` folder.
 * @returns {string} The runtime directory the files were copied into.
 * @throws {Error} If any source file is missing.
 */
function syncRuntime(workspaceRoot) {
    const runtimeDir = path.join(os.homedir(), ".vesper", "data");
    const files = ["metadata.db", "vectors.json", "vectors.bin"];
    const pairs = files.map((file) => ({
        src: path.join(workspaceRoot, "data", file),
        dest: path.join(runtimeDir, file),
    }));

    const missing = pairs.find(({ src }) => !fs.existsSync(src));
    if (missing) {
        throw new Error(`Missing source file: ${missing.src}`);
    }

    fs.mkdirSync(runtimeDir, { recursive: true });
    for (const { src, dest } of pairs) {
        fs.copyFileSync(src, dest);
    }

    return runtimeDir;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Refresh the dataset index end to end.
 *
 * Runs the `massive-scrape` and `index` npm scripts in the current working
 * directory, copies the resulting index files to the runtime directory,
 * then prints dataset/vector counts for both locations.
 *
 * @throws {Error} If a step fails; the caller is expected to report it.
 */
function main() {
    const workspaceRoot = process.cwd();
    const workspaceDbPath = path.join(workspaceRoot, "data", "metadata.db");
    const workspaceVecPath = path.join(workspaceRoot, "data", "vectors.json");

    console.log("\n[refresh-index] Step 1/3: Massive scrape...");
    runCommand("npm", ["run", "massive-scrape"]);

    console.log("\n[refresh-index] Step 2/3: High-memory indexing...");
    // Append the heap flag to any NODE_OPTIONS already set instead of
    // replacing them, so options the user relies on are still passed on.
    const nodeOptions = [process.env.NODE_OPTIONS, "--max-old-space-size=8192"].filter(Boolean).join(" ");
    const env = { ...process.env, NODE_OPTIONS: nodeOptions };
    runCommand("npm", ["run", "index"], { env });

    console.log("\n[refresh-index] Step 3/3: Sync workspace index to runtime...");
    const runtimeDir = syncRuntime(workspaceRoot);
    const runtimeDbPath = path.join(runtimeDir, "metadata.db");
    const runtimeVecPath = path.join(runtimeDir, "vectors.json");

    const wsDb = countDatasets(workspaceDbPath);
    const wsVec = countVectors(workspaceVecPath);
    const rtDb = countDatasets(runtimeDbPath);
    const rtVec = countVectors(runtimeVecPath);

    console.log("\n[refresh-index] Completed successfully.");
    console.log(`[refresh-index] Workspace: DB=${wsDb}, VECTORS=${wsVec}`);
    console.log(`[refresh-index] Runtime: DB=${rtDb}, VECTORS=${rtVec}`);
    console.log(`[refresh-index] Runtime path: ${runtimeDir}\n`);
}
|
|
81
|
+
|
|
82
|
+
// Script entry point: run the refresh; on any thrown error print its
// message and terminate with exit code 1.
try {
    main();
} catch (failure) {
    console.error("\n[refresh-index] Failed:", failure.message);
    process.exit(1);
}
|
|
@@ -91,12 +91,34 @@ function httpJson(method, url, body) {
|
|
|
91
91
|
});
|
|
92
92
|
}
|
|
93
93
|
|
|
94
|
-
async function
|
|
94
|
+
/**
 * Probe one candidate API base URL by POSTing to its device-auth start route.
 *
 * Never rejects; the outcome is encoded in the returned object's `status`:
 *   - 'ready'          HTTP 201 with a truthy `code` in the body
 *   - 'setup-required' HTTP 503 with a truthy `requiresSetup` in the body
 *   - 'unreachable'    any other response, or a thrown error
 *
 * @param {string} baseUrl Base URL to probe.
 * @returns {Promise<object>} `{ baseUrl, status, response?, message? }`.
 */
async function probeDeviceAuth(baseUrl) {
    try {
        const reply = await httpJson('POST', `${baseUrl}/api/auth/device/start`);
        const httpStatus = reply.status;
        const payload = reply.body;

        if (httpStatus === 201 && payload && payload.code) {
            return { baseUrl, status: 'ready', response: payload };
        }

        if (httpStatus === 503 && payload && payload.requiresSetup) {
            const reason = payload.error || 'Auth storage is not initialized.';
            return { baseUrl, status: 'setup-required', response: payload, message: reason };
        }

        // Anything else counts as unreachable; keep the body as the explanation.
        const detail = typeof payload === 'string' ? payload : JSON.stringify(payload);
        return { baseUrl, status: 'unreachable', response: payload, message: detail };
    } catch (error) {
        const detail = error && error.message ? error.message : 'Request failed';
        return { baseUrl, status: 'unreachable', message: detail };
    }
}
|
|
102
124
|
|
|
@@ -105,13 +127,20 @@ async function resolveVesperApiBaseUrl() {
|
|
|
105
127
|
? [VESPER_API_URL]
|
|
106
128
|
: DEFAULT_VESPER_API_CANDIDATES;
|
|
107
129
|
|
|
130
|
+
let setupRequiredProbe = null;
|
|
131
|
+
|
|
108
132
|
for (const candidate of candidates) {
|
|
109
|
-
|
|
110
|
-
|
|
133
|
+
const probe = await probeDeviceAuth(candidate);
|
|
134
|
+
if (probe.status === 'ready') {
|
|
135
|
+
return probe;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
if (!setupRequiredProbe && probe.status === 'setup-required') {
|
|
139
|
+
setupRequiredProbe = probe;
|
|
111
140
|
}
|
|
112
141
|
}
|
|
113
142
|
|
|
114
|
-
return
|
|
143
|
+
return setupRequiredProbe;
|
|
115
144
|
}
|
|
116
145
|
|
|
117
146
|
function openBrowser(url) {
|
|
@@ -136,6 +165,72 @@ function askYesNo(question) {
|
|
|
136
165
|
});
|
|
137
166
|
}
|
|
138
167
|
|
|
168
|
+
/**
 * Ask a free-text question on the terminal.
 *
 * @param {string} question Prompt text shown to the user.
 * @returns {Promise<string>} The answer, trimmed; empty string when nothing was entered.
 */
function askInput(question) {
    const ask = (done) => {
        const terminal = readline.createInterface({ input: process.stdin, output: process.stdout });
        const onAnswer = (reply) => {
            // Close first so stdin is released before the caller continues.
            terminal.close();
            done(String(reply || '').trim());
        };
        terminal.question(` ${question} `, onAnswer);
    };
    return new Promise(ask);
}
|
|
177
|
+
|
|
178
|
+
/**
 * Show a numbered menu and return the value of the option the user picks.
 *
 * The answer may be the 1-based option number or the option's `value`
 * string. An empty answer selects `defaultValue` when one is given. An
 * answer that matches nothing also yields `defaultValue` (which may be
 * undefined).
 *
 * @param {string} question Heading printed above the options.
 * @param {{value: string, label: string}[]} choices Options to list.
 * @param {string} [defaultValue] Value used for empty or unrecognized input.
 * @returns {Promise<string|undefined>} The chosen value.
 */
async function askChoice(question, choices, defaultValue) {
    console.log(` ${question}`);
    choices.forEach((choice, index) => {
        console.log(` ${dim(String(index + 1) + ')')} ${choice.label}`);
    });

    const prompt = defaultValue ? `${dim('[default: ' + defaultValue + ']')}` : '';
    const answer = await askInput(`${prompt} ${cyan('→')} Choose an option:`);
    if (!answer && defaultValue) {
        return defaultValue;
    }

    const numeric = Number(answer);
    // Require a whole number: an in-range fraction such as "1.5" would
    // otherwise index `choices` with a non-integer and hit undefined.
    if (Number.isInteger(numeric) && numeric >= 1 && numeric <= choices.length) {
        return choices[numeric - 1].value;
    }

    const matched = choices.find((choice) => choice.value === answer);
    return matched ? matched.value : defaultValue;
}
|
|
198
|
+
|
|
199
|
+
/**
 * Tell whether `value` looks like a cloud (non-local) Vesper API key.
 *
 * A cloud key is a string that starts with "vesper_sk_" but not with
 * "vesper_sk_local_". Non-string values are never cloud keys.
 *
 * @param {*} value Candidate key.
 * @returns {boolean} True only for a cloud-style key string.
 */
function isCloudApiKey(value) {
    // Guard the type first: .startsWith only exists on strings.
    if (typeof value !== 'string') return false;
    return value.startsWith('vesper_sk_') && !value.startsWith('vesper_sk_local_');
}
|
|
202
|
+
|
|
203
|
+
/**
 * Keep asking for an API key until the input passes isCloudApiKey().
 *
 * @returns {Promise<string>} The accepted key.
 */
async function promptForManualApiKey() {
    console.log(`\n ${cyan('■')} ${bold('Manual API Key')}`);
    console.log(` ${dim('Paste a Vesper cloud API key. It will be stored locally in config.toml.\n')}`);

    for (;;) {
        const candidate = await askInput(`${cyan('→')} Vesper API key:`);
        if (!isCloudApiKey(candidate)) {
            // Rejected input: explain the expected prefix and ask again.
            console.log(` ${yellow('!')} ${yellow('Expected a Vesper key starting with vesper_sk_')}`);
            continue;
        }
        return candidate;
    }
}
|
|
215
|
+
|
|
216
|
+
/**
 * Ask how the user wants to authenticate.
 *
 * When a key already exists, its first 24 characters and the current mode
 * are printed, and a "keep" option is offered and made the default;
 * otherwise the default is "browser".
 *
 * @param {string} existingKey Currently stored key; empty when none.
 * @param {string} existingAuthMode Currently stored mode; when empty it is
 *   derived from the key via isCloudApiKey().
 * @returns {Promise<string|undefined>} Whatever askChoice() resolves with:
 *   'keep', 'manual', 'browser' or 'local'.
 */
async function chooseAuthMode(existingKey, existingAuthMode) {
    const keyPresent = Boolean(existingKey);

    if (keyPresent) {
        console.log(` ${dim('Current key:')} ${dim(existingKey.slice(0, 24) + '...')}`);
        const modeLabel = existingAuthMode || (isCloudApiKey(existingKey) ? 'cloud' : 'local_unified');
        console.log(` ${dim('Current mode:')} ${dim(modeLabel)}`);
    }

    const options = [
        ...(keyPresent ? [{ value: 'keep', label: 'Keep current key as-is' }] : []),
        { value: 'manual', label: 'Provide Vesper API key manually' },
        { value: 'browser', label: 'Sign in through the browser' },
        { value: 'local', label: 'Use local-only key' },
    ];
    const fallback = keyPresent ? 'keep' : 'browser';

    return await askChoice(`${cyan('→')} How do you want to authenticate Vesper?`, options, fallback);
}
|
|
233
|
+
|
|
139
234
|
async function deviceAuthFlow() {
|
|
140
235
|
console.log(`\n ${cyan('■')} ${bold('Device Authentication')}`);
|
|
141
236
|
console.log(` ${dim('Link your CLI to a Vesper account for cloud features\n')}`);
|
|
@@ -149,16 +244,25 @@ async function deviceAuthFlow() {
|
|
|
149
244
|
return null;
|
|
150
245
|
}
|
|
151
246
|
|
|
152
|
-
|
|
247
|
+
if (resolvedApiBaseUrl.status === 'setup-required') {
|
|
248
|
+
console.log(` ${yellow('!')} ${yellow('Reached Vesper auth endpoint, but local auth storage is not initialized.')}`);
|
|
249
|
+
console.log(` ${dim('Endpoint:')} ${dim(resolvedApiBaseUrl.baseUrl)}`);
|
|
250
|
+
console.log(` ${dim('Reason:')} ${dim(resolvedApiBaseUrl.message || 'Apply Supabase migrations first.')}`);
|
|
251
|
+
console.log(` ${dim('Run the SQL in supabase/migrations/001_device_auth.sql and 002_rate_limits.sql, then retry.')}`);
|
|
252
|
+
console.log(` ${dim('Falling back to local-only mode.\n')}`);
|
|
253
|
+
return null;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
console.log(` ${dim('Auth endpoint:')} ${dim(resolvedApiBaseUrl.baseUrl)}\n`);
|
|
153
257
|
|
|
154
258
|
// Step 1: Call /api/auth/device/start
|
|
155
259
|
process.stdout.write(` ${dim('Requesting device code...')}`);
|
|
156
260
|
let startRes;
|
|
157
261
|
try {
|
|
158
|
-
startRes = await httpJson('POST', `${resolvedApiBaseUrl}/api/auth/device/start`);
|
|
262
|
+
startRes = await httpJson('POST', `${resolvedApiBaseUrl.baseUrl}/api/auth/device/start`);
|
|
159
263
|
} catch (err) {
|
|
160
264
|
console.log(` ${red('✗')}`);
|
|
161
|
-
console.log(` ${red('Could not reach Vesper API at')} ${dim(resolvedApiBaseUrl)}`);
|
|
265
|
+
console.log(` ${red('Could not reach Vesper API at')} ${dim(resolvedApiBaseUrl.baseUrl)}`);
|
|
162
266
|
console.log(` ${dim('Falling back to local-only mode.\n')}`);
|
|
163
267
|
return null;
|
|
164
268
|
}
|
|
@@ -198,7 +302,7 @@ async function deviceAuthFlow() {
|
|
|
198
302
|
process.stdout.write(`\r ${cyan(frame)} Polling... (${polls})`);
|
|
199
303
|
|
|
200
304
|
try {
|
|
201
|
-
const pollRes = await httpJson('GET', `${resolvedApiBaseUrl}/api/auth/device/poll?code=${code}`);
|
|
305
|
+
const pollRes = await httpJson('GET', `${resolvedApiBaseUrl.baseUrl}/api/auth/device/poll?code=${code}`);
|
|
202
306
|
|
|
203
307
|
if (pollRes.body.status === 'confirmed' && pollRes.body.apiKey) {
|
|
204
308
|
process.stdout.write(`\r ${green('✓')} Device authenticated! \n`);
|
|
@@ -357,36 +461,48 @@ async function main() {
|
|
|
357
461
|
let localKey = existing.api_key || '';
|
|
358
462
|
let authMode = existing.auth_mode || '';
|
|
359
463
|
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
console.log(` ${
|
|
364
|
-
} else {
|
|
365
|
-
|
|
366
|
-
|
|
464
|
+
const authChoice = await chooseAuthMode(localKey, authMode);
|
|
465
|
+
|
|
466
|
+
if (authChoice === 'keep' && localKey) {
|
|
467
|
+
console.log(` ${green('✓')} Keeping current key`);
|
|
468
|
+
} else if (authChoice === 'manual') {
|
|
469
|
+
localKey = await promptForManualApiKey();
|
|
470
|
+
authMode = 'cloud';
|
|
471
|
+
console.log(` ${green('✓')} Cloud API key saved from manual input`);
|
|
472
|
+
} else if (authChoice === 'browser') {
|
|
473
|
+
const cloudKey = await deviceAuthFlow();
|
|
474
|
+
if (cloudKey) {
|
|
475
|
+
localKey = cloudKey;
|
|
476
|
+
authMode = 'cloud';
|
|
477
|
+
} else {
|
|
478
|
+
const fallbackChoice = await askChoice(`${yellow('!')} Browser sign-in did not complete. Choose a fallback:`, [
|
|
479
|
+
{ value: 'manual', label: 'Provide Vesper API key manually' },
|
|
480
|
+
{ value: 'local', label: 'Use local-only key' },
|
|
481
|
+
], 'manual');
|
|
367
482
|
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
if (cloudKey) {
|
|
371
|
-
localKey = cloudKey;
|
|
483
|
+
if (fallbackChoice === 'manual') {
|
|
484
|
+
localKey = await promptForManualApiKey();
|
|
372
485
|
authMode = 'cloud';
|
|
373
486
|
} else {
|
|
374
|
-
|
|
375
|
-
|
|
487
|
+
if (!localKey || isCloudApiKey(localKey)) {
|
|
488
|
+
localKey = generateLocalKey();
|
|
489
|
+
}
|
|
376
490
|
authMode = 'local_unified';
|
|
377
491
|
console.log(`\n ${yellow('⚠')} Using local-only key. Run the wizard again anytime to link an account.`);
|
|
378
492
|
}
|
|
379
|
-
} else {
|
|
380
|
-
if (!localKey) localKey = generateLocalKey();
|
|
381
|
-
authMode = 'local_unified';
|
|
382
|
-
console.log(` ${green('✓')} Local-only key generated`);
|
|
383
493
|
}
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
494
|
+
} else {
|
|
495
|
+
if (!localKey || isCloudApiKey(localKey)) {
|
|
496
|
+
localKey = generateLocalKey();
|
|
497
|
+
}
|
|
498
|
+
authMode = 'local_unified';
|
|
499
|
+
console.log(` ${green('✓')} Local-only key ready`);
|
|
388
500
|
}
|
|
389
501
|
|
|
502
|
+
const configData = { ...existing, api_key: localKey, auth_mode: authMode };
|
|
503
|
+
writeToml(CONFIG_TOML, configData);
|
|
504
|
+
console.log(` ${dim('Key:')} ${dim(localKey.slice(0, 24) + '...')} ${dim('→')} ${dim(CONFIG_TOML)}`);
|
|
505
|
+
|
|
390
506
|
// ─── Step 3: Local vault initialization ────────────────────
|
|
391
507
|
process.stdout.write(`\n ${dim('[')}${cyan('3/6')}${dim(']')} Initializing local credentials vault...`);
|
|
392
508
|
const vaultData = readToml(CONFIG_TOML);
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import asyncio
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
import warnings
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Dict
|
|
9
|
+
|
|
10
|
+
# Suppress noisy HF warnings
|
|
11
|
+
warnings.filterwarnings("ignore", message=".*trust_remote_code.*")
|
|
12
|
+
warnings.filterwarnings("ignore", message=".*legacy.*")
|
|
13
|
+
|
|
14
|
+
CURRENT_DIR = Path(__file__).resolve().parent
|
|
15
|
+
if str(CURRENT_DIR) not in sys.path:
|
|
16
|
+
sys.path.insert(0, str(CURRENT_DIR))
|
|
17
|
+
|
|
18
|
+
from vesper.core.asset_downloader import AssetDownloader
|
|
19
|
+
from vesper.core.download_recipe import build_download_recipe, save_recipe, get_download_recipe
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _print(payload: Dict[str, Any]) -> None:
    """Write *payload* to stdout as one line of JSON, keeping non-ASCII text unescaped."""
    encoded = json.dumps(payload, ensure_ascii=False)
    print(encoded)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
    """Run one asset download described by the JSON text in ``args.payload``.

    Payload keys read here: ``dataset_id`` (required), ``output_root``,
    ``workers``, ``recipes_dir``, ``token``/``hf_token``, ``source``,
    ``repo_id``, ``kaggle_ref``, ``urls``, ``output_format``, ``max_items``
    and ``image_column``.

    Side effect: when a token is supplied it is exported as ``HF_TOKEN`` in
    this process's environment.

    Returns:
        ``{"ok": True, "result": <value returned by download_assets>}``.

    Raises:
        ValueError: If the payload is not a JSON object or has no
            ``dataset_id`` (previously a missing id was silently passed on
            as the literal string ``"None"``).
    """
    payload = json.loads(args.payload)
    if not isinstance(payload, dict):
        raise ValueError("payload must be a JSON object")

    dataset_id = payload.get("dataset_id")
    if dataset_id is None or not str(dataset_id).strip():
        raise ValueError("dataset_id is required")

    output_root = payload.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
    # Fall back to 8 workers when unset; never pass fewer than one.
    workers = max(1, int(payload.get("workers") or 8))
    recipes_dir = payload.get("recipes_dir")

    # Auto-set HF token from payload if provided
    token = payload.get("token") or payload.get("hf_token")
    if token:
        os.environ["HF_TOKEN"] = str(token)

    downloader = AssetDownloader(output_root=output_root, workers=workers, recipes_dir=recipes_dir)

    result = await downloader.download_assets(
        dataset_id=str(dataset_id),
        source=payload.get("source"),
        repo_id=payload.get("repo_id"),
        kaggle_ref=payload.get("kaggle_ref"),
        urls=payload.get("urls"),
        output_format=payload.get("output_format", "webdataset"),
        max_items=payload.get("max_items"),
        image_column=payload.get("image_column"),
    )
    return {"ok": True, "result": result}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _describe_error(error: Exception) -> str:
    """Return the message to report for *error*, with a hint where one is known."""
    # Some exceptions stringify to ""; fall back to the class name.
    message = str(error) or type(error).__name__
    if "401" in message or "403" in message or "Unauthorized" in message:
        return (
            "Authentication required. This dataset may be gated/private. "
            "Use configure_keys tool to set HF_TOKEN, then retry."
        )
    if "No image column" in message:
        return message + " Hint: specify image_column parameter with the name of the column containing images."
    return message


def main() -> None:
    """Command-line entry point: ``<action> <json-payload>``.

    ``action`` is one of ``download``, ``build_recipe`` or ``get_recipe``.
    Exactly one JSON object is printed to stdout: ``{"ok": true, ...}`` on
    success or ``{"ok": false, "error": "..."}`` when anything raises. The
    error case is reported through that JSON only; no exception escapes and
    no explicit exit code is set here.
    """
    parser = argparse.ArgumentParser(description="Vesper Smart Asset Downloader Engine")
    parser.add_argument("action", choices=["download", "build_recipe", "get_recipe"])
    parser.add_argument("payload", help="JSON payload")
    args = parser.parse_args()

    try:
        if args.action == "download":
            response = asyncio.run(_run_download(args))
            _print(response)
            return

        payload = json.loads(args.payload)
        if not isinstance(payload, dict):
            raise ValueError("payload must be a JSON object")

        if args.action == "build_recipe":
            recipe = build_download_recipe(payload)
            saved = save_recipe(recipe, payload.get("recipes_dir"))
            _print({"ok": True, "recipe": recipe, "saved_to": saved})
            return

        if args.action == "get_recipe":
            dataset_id = payload.get("dataset_id")
            # Avoid looking up the literal string "None" when the id is absent.
            if dataset_id is None:
                raise ValueError("dataset_id is required")
            recipe = get_download_recipe(str(dataset_id), payload.get("recipes_dir"))
            _print({"ok": True, "recipe": recipe})
            return

        # Not reachable while argparse restricts `action` to the choices above.
        _print({"ok": False, "error": f"Unknown action: {args.action}"})
    except Exception as e:
        # Top-level boundary: report every failure as JSON for the caller.
        _print({"ok": False, "error": _describe_error(e)})
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
if __name__ == "__main__":
|
|
92
|
+
main()
|