@vespermcp/mcp-server 1.2.21 → 1.2.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/build/cloud/adapters/supabase.js +49 -0
- package/build/cloud/storage-manager.js +6 -0
- package/build/export/exporter.js +22 -9
- package/build/gateway/unified-dataset-gateway.js +410 -0
- package/build/index.js +1587 -845
- package/build/ingestion/ingestor.js +7 -4
- package/build/install/install-service.js +11 -6
- package/build/lib/supabase.js +3 -0
- package/build/metadata/scraper.js +85 -14
- package/build/python/asset_downloader_engine.py +2 -0
- package/build/python/convert_engine.py +92 -0
- package/build/python/export_engine.py +45 -0
- package/build/python/kaggle_engine.py +77 -5
- package/build/python/normalize_engine.py +83 -0
- package/build/python/vesper/core/asset_downloader.py +5 -1
- package/build/search/engine.js +43 -5
- package/build/search/jit-orchestrator.js +18 -14
- package/build/search/query-intent.js +509 -0
- package/build/tools/formatter.js +6 -3
- package/build/utils/python-runtime.js +130 -0
- package/package.json +7 -5
- package/scripts/postinstall.cjs +87 -31
- package/scripts/wizard.cjs +601 -0
- package/scripts/wizard.js +306 -12
- package/src/python/__pycache__/config.cpython-312.pyc +0 -0
- package/src/python/__pycache__/kaggle_engine.cpython-312.pyc +0 -0
- package/src/python/asset_downloader_engine.py +2 -0
- package/src/python/convert_engine.py +92 -0
- package/src/python/export_engine.py +45 -0
- package/src/python/kaggle_engine.py +77 -5
- package/src/python/normalize_engine.py +83 -0
- package/src/python/requirements.txt +12 -0
- package/src/python/vesper/core/asset_downloader.py +5 -1
- package/wizard.cjs +3 -0
package/scripts/wizard.js
CHANGED
|
@@ -10,6 +10,9 @@ const path = require('path');
|
|
|
10
10
|
const os = require('os');
|
|
11
11
|
const crypto = require('crypto');
|
|
12
12
|
const { execSync, spawnSync } = require('child_process');
|
|
13
|
+
const http = require('http');
|
|
14
|
+
const https = require('https');
|
|
15
|
+
const readline = require('readline');
|
|
13
16
|
|
|
14
17
|
// ── Paths ────────────────────────────────────────────────────
|
|
15
18
|
const HOME = os.homedir();
|
|
@@ -54,6 +57,271 @@ function yellow(text) { return `\x1b[33m${text}\x1b[0m`; }
|
|
|
54
57
|
function red(text) { return `\x1b[31m${text}\x1b[0m`; }
|
|
55
58
|
function magenta(text) { return `\x1b[35m${text}\x1b[0m`; }
|
|
56
59
|
|
|
60
|
+
// ── Vesper API URL resolution ────────────────────────────────
|
|
61
|
+
const VESPER_API_URL = process.env.VESPER_API_URL || '';
|
|
62
|
+
const DEFAULT_VESPER_API_CANDIDATES = [
|
|
63
|
+
'https://getvesper.dev',
|
|
64
|
+
'http://localhost:3000',
|
|
65
|
+
'http://127.0.0.1:3000',
|
|
66
|
+
];
|
|
67
|
+
|
|
68
|
+
// ── Device Auth Helpers ──────────────────────────────────────
|
|
69
|
+
/**
 * Perform a single HTTP(S) request and resolve with the status code and body.
 *
 * The response body is decoded as UTF-8 and parsed as JSON; when it is not
 * valid JSON the raw text is returned instead, so callers must be prepared
 * for `body` to be either a parsed value or a string.
 *
 * @param {string} method - HTTP verb, e.g. 'GET' or 'POST'.
 * @param {string} url - Absolute http:// or https:// URL.
 * @param {*} [body] - Optional payload; serialized with JSON.stringify when truthy.
 * @param {number} [timeoutMs=15000] - Socket inactivity timeout in milliseconds
 *   (0 disables it). On timeout the request is destroyed and the promise rejects.
 * @returns {Promise<{status: number, body: *}>} Resolves for any HTTP status;
 *   rejects only on transport errors, an invalid URL, or a timeout.
 */
function httpJson(method, url, body, timeoutMs = 15000) {
  return new Promise((resolve, reject) => {
    const parsed = new URL(url);
    const isHttps = parsed.protocol === 'https:';
    const lib = isHttps ? https : http;
    const opts = {
      method,
      hostname: parsed.hostname,
      port: parsed.port || (isHttps ? 443 : 80),
      path: parsed.pathname + parsed.search,
      headers: { 'Content-Type': 'application/json' },
    };

    const req = lib.request(opts, (res) => {
      // Decode through a streaming UTF-8 decoder: concatenating raw Buffer
      // chunks would corrupt multi-byte characters split across chunks.
      res.setEncoding('utf8');
      let data = '';
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('error', reject);
      res.on('end', () => {
        try {
          resolve({ status: res.statusCode, body: JSON.parse(data) });
        } catch {
          // Not JSON: hand back the raw text.
          resolve({ status: res.statusCode, body: data });
        }
      });
    });

    req.on('error', reject);
    // Without a timeout a stalled endpoint would hang the wizard forever.
    req.setTimeout(timeoutMs, () => {
      req.destroy(new Error(`Request timed out after ${timeoutMs}ms`));
    });

    if (body) req.write(JSON.stringify(body));
    req.end();
  });
}
|
|
93
|
+
|
|
94
|
+
/**
 * Probe a candidate Vesper API base URL by calling its device-auth start endpoint.
 *
 * Trailing slashes on `baseUrl` are stripped before the endpoint path is
 * appended, so a value such as "https://host/" does not produce "//api/...".
 * The normalized URL is what is reported back in `baseUrl`.
 *
 * Never rejects: transport failures are reported as status 'unreachable'.
 *
 * @param {string} baseUrl - Candidate origin, e.g. "https://getvesper.dev".
 * @returns {Promise<{baseUrl: string, status: 'ready'|'setup-required'|'unreachable', response?: *, message?: string}>}
 *   - 'ready': HTTP 201 with a `code` in the body.
 *   - 'setup-required': HTTP 503 with `requiresSetup` set in the body.
 *   - 'unreachable': any other response, or a request error.
 */
async function probeDeviceAuth(baseUrl) {
  const normalizedBaseUrl = String(baseUrl).replace(/\/+$/, '');

  try {
    const res = await httpJson('POST', `${normalizedBaseUrl}/api/auth/device/start`);

    if (res.status === 201 && !!res.body && !!res.body.code) {
      return { baseUrl: normalizedBaseUrl, status: 'ready', response: res.body };
    }

    if (res.status === 503 && res.body && res.body.requiresSetup) {
      return {
        baseUrl: normalizedBaseUrl,
        status: 'setup-required',
        response: res.body,
        message: res.body.error || 'Auth storage is not initialized.',
      };
    }

    // Reached something, but it did not answer like a device-auth endpoint.
    return {
      baseUrl: normalizedBaseUrl,
      status: 'unreachable',
      response: res.body,
      message: typeof res.body === 'string' ? res.body : JSON.stringify(res.body),
    };
  } catch (error) {
    return {
      baseUrl: normalizedBaseUrl,
      status: 'unreachable',
      message: error && error.message ? error.message : 'Request failed',
    };
  }
}
|
|
124
|
+
|
|
125
|
+
/**
 * Find a usable Vesper API endpoint.
 *
 * Probes only VESPER_API_URL when it is set, otherwise each entry of
 * DEFAULT_VESPER_API_CANDIDATES in order. Probing is sequential and stops at
 * the first endpoint that reports 'ready'.
 *
 * @returns {Promise<object|null>} The first 'ready' probe result; failing that,
 *   the first 'setup-required' probe result; failing that, null.
 */
async function resolveVesperApiBaseUrl() {
  const urls = VESPER_API_URL ? [VESPER_API_URL] : DEFAULT_VESPER_API_CANDIDATES;
  let firstNeedingSetup = null;

  for (let i = 0; i < urls.length; i += 1) {
    const result = await probeDeviceAuth(urls[i]);

    switch (result.status) {
      case 'ready':
        return result;
      case 'setup-required':
        // Remember only the earliest endpoint that needs setup.
        if (!firstNeedingSetup) firstNeedingSetup = result;
        break;
      default:
        break;
    }
  }

  return firstNeedingSetup;
}
|
|
145
|
+
|
|
146
|
+
/**
 * Best-effort: open `url` in the user's default browser.
 *
 * The URL comes from a server response, so it is validated first: only
 * absolute http:// and https:// URLs are launched. On Windows the URL is
 * handed to rundll32's FileProtocolHandler instead of `cmd /c start`, because
 * cmd.exe would interpret characters such as `&` in the URL as command
 * separators.
 *
 * Never throws.
 *
 * @param {string} url - Address to open.
 * @returns {boolean} true when a launcher was started and exited with status 0;
 *   false when the URL was rejected or the launcher could not be run.
 */
function openBrowser(url) {
  let target;
  try {
    target = new URL(url);
  } catch {
    return false; // not an absolute URL
  }
  if (target.protocol !== 'http:' && target.protocol !== 'https:') {
    return false;
  }

  const href = target.href;
  try {
    let result;
    if (process.platform === 'win32') {
      result = spawnSync('rundll32', ['url.dll,FileProtocolHandler', href], { stdio: 'ignore' });
    } else if (process.platform === 'darwin') {
      result = spawnSync('open', [href], { stdio: 'ignore' });
    } else {
      result = spawnSync('xdg-open', [href], { stdio: 'ignore' });
    }
    // spawnSync reports a missing launcher via `error` rather than throwing.
    return !result.error && result.status === 0;
  } catch {
    /* browser open is best-effort */
    return false;
  }
}
|
|
157
|
+
|
|
158
|
+
/**
 * Ask a yes/no question on the terminal, defaulting to "yes".
 *
 * The answer is trimmed before it is inspected, so " y" or "yes " count as
 * yes and a whitespace-only answer is treated like pressing Enter.
 *
 * @param {string} question - Prompt text (a "[Y/n]" hint is appended).
 * @returns {Promise<boolean>} true for an empty answer or one starting with
 *   "y"/"Y"; false otherwise.
 */
function askYesNo(question) {
  return new Promise((resolve) => {
    const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
    rl.question(` ${question} ${dim('[Y/n]')} `, (answer) => {
      rl.close();
      const normalized = String(answer || '').trim().toLowerCase();
      resolve(normalized === '' || normalized.startsWith('y'));
    });
  });
}
|
|
167
|
+
|
|
168
|
+
/**
 * Prompt for a line of free-form text on the terminal.
 *
 * @param {string} question - Prompt text shown before the cursor.
 * @returns {Promise<string>} The entered text with surrounding whitespace
 *   removed; an empty string when nothing was entered.
 */
function askInput(question) {
  return new Promise((done) => {
    const terminal = readline.createInterface({ input: process.stdin, output: process.stdout });

    const onAnswer = (raw) => {
      terminal.close();
      const text = raw ? String(raw) : '';
      done(text.trim());
    };

    terminal.question(` ${question} `, onAnswer);
  });
}
|
|
177
|
+
|
|
178
|
+
/**
 * Print a numbered menu and resolve with the value of the chosen entry.
 *
 * The answer may be the 1-based position of a choice or the choice's `value`
 * itself. Only whole numbers are accepted as positions: a fractional answer
 * such as "1.5" would otherwise index past the menu entries and crash.
 *
 * @param {string} question - Heading printed above the menu.
 * @param {Array<{value: string, label: string}>} choices - Menu entries, in display order.
 * @param {string} [defaultValue] - Value returned for an empty or unrecognized answer.
 * @returns {Promise<string|undefined>} The selected value, or `defaultValue`
 *   (possibly undefined) when the answer matches nothing.
 */
async function askChoice(question, choices, defaultValue) {
  console.log(` ${question}`);
  choices.forEach((choice, index) => {
    console.log(` ${dim(String(index + 1) + ')')} ${choice.label}`);
  });

  const prompt = defaultValue ? `${dim('[default: ' + defaultValue + ']')}` : '';
  const answer = await askInput(`${prompt} ${cyan('→')} Choose an option:`);
  if (!answer && defaultValue) {
    return defaultValue;
  }

  // Accept only integral positions inside the menu.
  const position = Number(answer);
  if (Number.isInteger(position) && position >= 1 && position <= choices.length) {
    return choices[position - 1].value;
  }

  const matched = choices.find((choice) => choice.value === answer);
  return matched ? matched.value : defaultValue;
}
|
|
198
|
+
|
|
199
|
+
/**
 * Tell whether a key looks like a Vesper cloud API key.
 *
 * @param {string} value - Candidate key.
 * @returns {boolean} true when `value` is non-empty, starts with "vesper_sk_"
 *   and does not start with the local-key prefix "vesper_sk_local_".
 */
function isCloudApiKey(value) {
  if (!value) {
    return false;
  }
  if (!value.startsWith('vesper_sk_')) {
    return false;
  }
  return !value.startsWith('vesper_sk_local_');
}
|
|
202
|
+
|
|
203
|
+
/**
 * Keep prompting until the user pastes something that passes isCloudApiKey.
 *
 * Prints a section heading once, then a warning after every rejected entry.
 *
 * @returns {Promise<string>} The accepted cloud API key.
 */
async function promptForManualApiKey() {
  console.log(`\n ${cyan('■')} ${bold('Manual API Key')}`);
  console.log(` ${dim('Paste a Vesper cloud API key. It will be stored locally in config.toml.\n')}`);

  let candidate = await askInput(`${cyan('→')} Vesper API key:`);
  while (!isCloudApiKey(candidate)) {
    console.log(` ${yellow('!')} ${yellow('Expected a Vesper key starting with vesper_sk_')}`);
    candidate = await askInput(`${cyan('→')} Vesper API key:`);
  }
  return candidate;
}
|
|
215
|
+
|
|
216
|
+
/**
 * Show the currently stored credentials (if any) and ask how to authenticate.
 *
 * @param {string} existingKey - Key already stored in the config, or '' when none.
 * @param {string} existingAuthMode - Stored auth mode, or '' to derive it from the key.
 * @returns {Promise<string>} Whatever askChoice resolves with for the two
 *   offered options 'browser' and 'manual' (default 'browser').
 */
async function chooseAuthMode(existingKey, existingAuthMode) {
  if (existingKey) {
    let modeLabel = existingAuthMode;
    if (!modeLabel) {
      // No stored mode: infer it from the shape of the key.
      modeLabel = isCloudApiKey(existingKey) ? 'cloud' : 'local_unified';
    }
    const keyPreview = existingKey.slice(0, 24) + '...';
    console.log(` ${dim('Current key:')} ${dim(keyPreview)}`);
    console.log(` ${dim('Current mode:')} ${dim(modeLabel)}`);
  }

  const options = [
    { value: 'browser', label: 'Sign in through the browser' },
    { value: 'manual', label: 'Provide Vesper API key manually' },
  ];

  return await askChoice(`${cyan('→')} How do you want to authenticate Vesper?`, options, 'browser');
}
|
|
229
|
+
|
|
230
|
+
/**
 * Run the browser-based device authentication flow.
 *
 * Steps:
 *   1. Resolve a reachable Vesper API endpoint.
 *   2. Request a device code and login URL from /api/auth/device/start.
 *   3. Show both, try to open the login URL in a browser.
 *   4. Poll /api/auth/device/poll every 3 seconds (at most 200 times) until
 *      the code is confirmed or expired.
 *
 * Every failure path prints an explanation and returns null so the caller
 * can fall back to manual key entry; this function does not throw for
 * network errors or malformed server responses.
 *
 * @returns {Promise<string|null>} The API key issued for this device, or null
 *   when no endpoint is usable, the start response is malformed, the code
 *   expires, or polling times out.
 */
async function deviceAuthFlow() {
  console.log(`\n ${cyan('■')} ${bold('Device Authentication')}`);
  console.log(` ${dim('Link your CLI to a Vesper account for cloud features\n')}`);

  const resolvedApiBaseUrl = await resolveVesperApiBaseUrl();
  if (!resolvedApiBaseUrl) {
    console.log(` ${red('✗')} ${red('Could not reach any Vesper auth endpoint.')}`);
    console.log(` ${dim('Tried:')} ${dim((VESPER_API_URL ? [VESPER_API_URL] : DEFAULT_VESPER_API_CANDIDATES).join(', '))}`);
    console.log(` ${dim('If your landing app is running locally, start it on http://localhost:3000 or set VESPER_API_URL.')}`);
    console.log(` ${dim('Falling back to manual key entry.\n')}`);
    return null;
  }

  if (resolvedApiBaseUrl.status === 'setup-required') {
    console.log(` ${yellow('!')} ${yellow('Reached Vesper auth endpoint, but local auth storage is not initialized.')}`);
    console.log(` ${dim('Endpoint:')} ${dim(resolvedApiBaseUrl.baseUrl)}`);
    console.log(` ${dim('Reason:')} ${dim(resolvedApiBaseUrl.message || 'Apply Supabase migrations first.')}`);
    console.log(` ${dim('Run the SQL in supabase/migrations/001_device_auth.sql and 002_rate_limits.sql, then retry.')}`);
    console.log(` ${dim('Falling back to manual key entry.\n')}`);
    return null;
  }

  console.log(` ${dim('Auth endpoint:')} ${dim(resolvedApiBaseUrl.baseUrl)}\n`);

  // Step 1: Call /api/auth/device/start
  process.stdout.write(` ${dim('Requesting device code...')}`);
  let startRes;
  try {
    startRes = await httpJson('POST', `${resolvedApiBaseUrl.baseUrl}/api/auth/device/start`);
  } catch (err) {
    console.log(` ${red('✗')}`);
    console.log(` ${red('Could not reach Vesper API at')} ${dim(resolvedApiBaseUrl.baseUrl)}`);
    console.log(` ${dim('Falling back to manual key entry.\n')}`);
    return null;
  }

  // The body may be null, a raw string, or lack fields; require both the code
  // and a usable login URL before going any further.
  const startBody = startRes.body;
  const hasUsableStart = startRes.status === 201
    && !!startBody
    && !!startBody.code
    && typeof startBody.loginUrl === 'string'
    && startBody.loginUrl !== '';
  if (!hasUsableStart) {
    console.log(` ${red('✗')}`);
    console.log(` ${red('Unexpected response:')} ${dim(JSON.stringify(startRes.body))}`);
    return null;
  }

  const { code, loginUrl } = startBody;
  console.log(` ${green('✓')}\n`);

  // Step 2: Display code and open browser
  console.log(` ┌───────────────────────────────────────────────┐`);
  console.log(` │ │`);
  console.log(` │ ${bold('Your device code:')} ${cyan(bold(code))} │`);
  console.log(` │ │`);
  console.log(` │ ${dim('Open this URL to sign in:')} │`);
  console.log(` │ ${cyan(loginUrl.padEnd(41))}│`);
  console.log(` │ │`);
  console.log(` └───────────────────────────────────────────────┘\n`);

  openBrowser(loginUrl);
  console.log(` ${dim('Browser opened automatically.')}`);
  console.log(` ${dim('Waiting for you to sign in...')}\n`);

  // Step 3: Poll until confirmed or expired
  const POLL_INTERVAL = 3000; // 3 seconds
  const MAX_POLLS = 200; // 10 min max (200 × 3s)
  let polls = 0;
  const spinner = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
  // Encode the code so characters such as '&' or spaces cannot corrupt the query string.
  const pollUrl = `${resolvedApiBaseUrl.baseUrl}/api/auth/device/poll?code=${encodeURIComponent(code)}`;

  while (polls < MAX_POLLS) {
    polls++;
    const frame = spinner[polls % spinner.length];
    process.stdout.write(`\r ${cyan(frame)} Polling... (${polls})`);

    try {
      const pollRes = await httpJson('GET', pollUrl);
      const pollBody = pollRes.body || {};

      if (pollBody.status === 'confirmed' && pollBody.apiKey) {
        process.stdout.write(`\r ${green('✓')} Device authenticated! \n`);
        console.log(` ${dim('Email:')} ${pollBody.email || 'linked'}`);
        return pollBody.apiKey;
      }

      if (pollBody.status === 'expired') {
        process.stdout.write(`\r ${red('✗')} Device code expired. \n`);
        console.log(` ${dim('Run the wizard again to get a new code.')}`);
        return null;
      }
    } catch {
      // Network hiccup — keep polling
    }

    await new Promise((r) => setTimeout(r, POLL_INTERVAL));
  }

  process.stdout.write(`\r ${red('✗')} Timed out waiting for authentication.\n`);
  return null;
}
|
|
324
|
+
|
|
57
325
|
function printBanner() {
|
|
58
326
|
console.log(`
|
|
59
327
|
${dim('─────────────────────────────────────────────────')}
|
|
@@ -182,21 +450,42 @@ async function main() {
|
|
|
182
450
|
ensureDir(path.join(VESPER_DIR, 'datasets'));
|
|
183
451
|
console.log(` ${green('✓')}`);
|
|
184
452
|
|
|
185
|
-
// ─── Step 2:
|
|
186
|
-
|
|
453
|
+
// ─── Step 2: Authenticate (device flow or local key) ──────
|
|
454
|
+
console.log(`\n ${dim('[')}${cyan('2/6')}${dim(']')} Authentication`);
|
|
455
|
+
|
|
187
456
|
const existing = readToml(CONFIG_TOML);
|
|
188
|
-
|
|
189
|
-
|
|
457
|
+
let localKey = existing.api_key || '';
|
|
458
|
+
let authMode = existing.auth_mode || '';
|
|
459
|
+
|
|
460
|
+
const authChoice = await chooseAuthMode(localKey, authMode);
|
|
461
|
+
|
|
462
|
+
if (authChoice === 'manual') {
|
|
463
|
+
localKey = await promptForManualApiKey();
|
|
464
|
+
authMode = 'cloud';
|
|
465
|
+
console.log(` ${green('✓')} Cloud API key saved from manual input`);
|
|
466
|
+
} else if (authChoice === 'browser') {
|
|
467
|
+
const cloudKey = await deviceAuthFlow();
|
|
468
|
+
if (cloudKey) {
|
|
469
|
+
localKey = cloudKey;
|
|
470
|
+
authMode = 'cloud';
|
|
471
|
+
} else {
|
|
472
|
+
console.log(`\n ${yellow('!')} Browser sign-in did not complete. Falling back to manual key entry.`);
|
|
473
|
+
localKey = await promptForManualApiKey();
|
|
474
|
+
authMode = 'cloud';
|
|
475
|
+
}
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
const configData = { ...existing, api_key: localKey, auth_mode: authMode };
|
|
190
479
|
writeToml(CONFIG_TOML, configData);
|
|
191
|
-
console.log(` ${
|
|
192
|
-
console.log(` ${dim('Key:')} ${dim(localKey.slice(0, 20) + '...')} ${dim('→')} ${dim(CONFIG_TOML)}`);
|
|
480
|
+
console.log(` ${dim('Key:')} ${dim(localKey.slice(0, 24) + '...')} ${dim('→')} ${dim(CONFIG_TOML)}`);
|
|
193
481
|
|
|
194
482
|
// ─── Step 3: Local vault initialization ────────────────────
|
|
195
483
|
process.stdout.write(`\n ${dim('[')}${cyan('3/6')}${dim(']')} Initializing local credentials vault...`);
|
|
196
|
-
|
|
197
|
-
|
|
484
|
+
const vaultData = readToml(CONFIG_TOML);
|
|
485
|
+
if (!vaultData.auth_mode) vaultData.auth_mode = 'local_unified';
|
|
486
|
+
writeToml(CONFIG_TOML, vaultData);
|
|
198
487
|
console.log(` ${green('✓')}`);
|
|
199
|
-
console.log(` ${dim('Mode:')} ${dim('single local Vesper key (no external keys required)')}`);
|
|
488
|
+
console.log(` ${dim('Mode:')} ${dim(vaultData.auth_mode === 'cloud' ? 'cloud (linked to Vesper account)' : 'single local Vesper key (no external keys required)')}`);
|
|
200
489
|
|
|
201
490
|
// ─── Step 4: Install @vespermcp/mcp-server ─────────────────
|
|
202
491
|
console.log(`\n ${dim('[')}${cyan('4/6')}${dim(']')} Installing Vesper MCP server...`);
|
|
@@ -251,19 +540,24 @@ async function main() {
|
|
|
251
540
|
console.log(` ${configuredAgents.length > 0 ? green('✓') : yellow('⚠')} MCP agents ${dim(configuredAgents.length + ' configured')}`);
|
|
252
541
|
|
|
253
542
|
// ─── Final Summary ─────────────────────────────────────────
|
|
543
|
+
const finalConfig = readToml(CONFIG_TOML);
|
|
544
|
+
const isCloud = finalConfig.auth_mode === 'cloud';
|
|
254
545
|
console.log(`
|
|
255
546
|
${dim('═════════════════════════════════════════════════')}
|
|
256
547
|
|
|
257
548
|
${green(bold('✓ Vesper is ready!'))}
|
|
258
549
|
|
|
259
|
-
${bold('Your local API key:')}
|
|
260
|
-
${cyan(localKey)}
|
|
550
|
+
${bold(isCloud ? 'Your cloud API key:' : 'Your local API key:')}
|
|
551
|
+
${cyan(finalConfig.api_key || localKey)}
|
|
552
|
+
|
|
553
|
+
${bold('Auth mode:')}
|
|
554
|
+
${dim(isCloud ? '☁ Cloud (linked to Vesper account)' : '🔑 Local-only (key never leaves your machine)')}
|
|
261
555
|
|
|
262
556
|
${bold('Config file:')}
|
|
263
557
|
${dim(CONFIG_TOML)}
|
|
264
558
|
|
|
265
559
|
${bold('What just happened:')}
|
|
266
|
-
${dim('1.')} Generated a local API key (never leaves your machine)
|
|
560
|
+
${dim('1.')} ${isCloud ? 'Linked to your Vesper cloud account' : 'Generated a local API key (never leaves your machine)'}
|
|
267
561
|
${dim('2.')} Initialized local credentials vault
|
|
268
562
|
${dim('3.')} Auto-configured MCP for ${configuredAgents.length > 0 ? configuredAgents.join(', ') : 'detected agents'}
|
|
269
563
|
${dim('4.')} Vesper server ready on stdio transport
|
|
Binary file
|
|
Binary file
|
|
@@ -26,6 +26,7 @@ def _print(payload: Dict[str, Any]) -> None:
|
|
|
26
26
|
async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
|
|
27
27
|
payload = json.loads(args.payload)
|
|
28
28
|
output_root = payload.get("output_root") or str(Path.home() / ".vesper" / "data" / "assets")
|
|
29
|
+
output_dir = payload.get("output_dir")
|
|
29
30
|
workers = int(payload.get("workers") or 8)
|
|
30
31
|
recipes_dir = payload.get("recipes_dir")
|
|
31
32
|
|
|
@@ -43,6 +44,7 @@ async def _run_download(args: argparse.Namespace) -> Dict[str, Any]:
|
|
|
43
44
|
kaggle_ref=payload.get("kaggle_ref"),
|
|
44
45
|
urls=payload.get("urls"),
|
|
45
46
|
output_format=payload.get("output_format", "webdataset"),
|
|
47
|
+
output_dir=str(output_dir) if output_dir else None,
|
|
46
48
|
max_items=payload.get("max_items"),
|
|
47
49
|
image_column=payload.get("image_column"),
|
|
48
50
|
)
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Convert a dataset file between formats (CSV, Parquet, JSON, JSONL).
|
|
3
|
+
Usage: convert_engine.py <input_path> <output_path>
|
|
4
|
+
Outputs JSON: {"ok": true, "output_path": "...", "rows": N, "columns": N} or {"ok": false, "error": "..."}
|
|
5
|
+
"""
|
|
6
|
+
import sys
|
|
7
|
+
import json
|
|
8
|
+
import os
|
|
9
|
+
|
|
10
|
+
try:
|
|
11
|
+
import polars as pl
|
|
12
|
+
except Exception:
|
|
13
|
+
print(json.dumps({"ok": False, "error": "polars is required. Install with: pip install polars"}))
|
|
14
|
+
sys.exit(1)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _load(src: str) -> pl.DataFrame:
    """Read a dataset file into a polars DataFrame, choosing the reader by extension.

    Supported extensions: .csv, .tsv/.tab, .parquet/.pq,
    .feather/.ftr/.arrow/.ipc, .jsonl/.ndjson and .json. Any other extension
    is attempted as CSV.

    For .json the content is sniffed:
      * text starting with "[" is read as a JSON array;
      * multi-line text whose first line starts with "{" is read as NDJSON;
      * a single JSON object is unwrapped via a well-known list key
        ("data", "rows", ...) or, failing that, the first value that is a
        non-empty list of dicts;
      * anything else is handed to ``pl.read_json``.

    Args:
        src: Path of the file to read.

    Returns:
        The loaded DataFrame.

    Raises:
        Whatever the underlying polars reader, ``open`` or ``json.loads`` raises.
    """
    ext = os.path.splitext(src)[1].lower()
    if ext == ".csv":
        return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
    if ext in (".tsv", ".tab"):
        return pl.read_csv(src, separator="\t", ignore_errors=True, infer_schema_length=10000)
    if ext in (".parquet", ".pq"):
        return pl.read_parquet(src)
    if ext in (".feather", ".ftr", ".arrow", ".ipc"):
        return pl.read_ipc(src)
    if ext in (".jsonl", ".ndjson"):
        return pl.read_ndjson(src)
    if ext == ".json":
        # Use a context manager so the handle is closed even if reading fails.
        with open(src, "r", encoding="utf-8") as handle:
            raw = handle.read().strip()
        if raw.startswith("["):
            return pl.read_json(src)
        # Only the first line is needed; avoid splitting the whole file.
        if "\n" in raw and raw.split("\n", 1)[0].strip().startswith("{"):
            return pl.read_ndjson(src)
        obj = json.loads(raw)
        if isinstance(obj, dict):
            # Prefer conventional wrapper keys, e.g. {"data": [...]}.
            for key in ("data", "rows", "items", "records", "results", "entries", "samples"):
                if key in obj and isinstance(obj[key], list):
                    return pl.DataFrame(obj[key])
            # Otherwise take the first value that looks like a list of records.
            for v in obj.values():
                if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
                    return pl.DataFrame(v)
        return pl.read_json(src)
    # Fallback: try csv
    return pl.read_csv(src, ignore_errors=True, infer_schema_length=10000)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _write(df: pl.DataFrame, dst: str) -> None:
    """Write ``df`` to ``dst`` in the format implied by the destination extension.

    The destination's parent directory is created first (also when the
    extension later turns out to be unsupported).

    Args:
        df: DataFrame to serialize.
        dst: Output path ending in .parquet/.pq, .csv, .json or .jsonl/.ndjson.

    Raises:
        ValueError: If the extension is none of the supported formats.
    """
    suffix = os.path.splitext(dst)[1].lower()
    parent = os.path.dirname(dst) or "."
    os.makedirs(parent, exist_ok=True)

    if suffix == ".csv":
        df.write_csv(dst)
        return
    if suffix == ".json":
        df.write_json(dst, row_oriented=True)
        return
    if suffix in (".jsonl", ".ndjson"):
        df.write_ndjson(dst)
        return
    if suffix in (".parquet", ".pq"):
        df.write_parquet(dst)
        return

    raise ValueError(f"Unsupported output format: {suffix}")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def main():
    """Command-line entry point: convert ``argv[1]`` into ``argv[2]``.

    Prints exactly one JSON object on stdout. On success it carries
    ``ok: true`` plus the output path, row and column counts and the output
    size in MiB; on any failure it carries ``ok: false`` with an ``error``
    message and the process exits with status 1.
    """

    def fail(message):
        # Report the problem in the same JSON envelope and abort.
        print(json.dumps({"ok": False, "error": message}))
        sys.exit(1)

    argv = sys.argv
    if len(argv) < 3:
        fail("Usage: convert_engine.py <input> <output>")

    input_path, output_path = argv[1], argv[2]

    if not os.path.exists(input_path):
        fail(f"File not found: {input_path}")

    try:
        df = _load(input_path)
        _write(df, output_path)
        bytes_written = os.path.getsize(output_path)
        summary = {
            "ok": True,
            "output_path": output_path,
            "rows": df.height,
            "columns": df.width,
            "size_mb": round(bytes_written / (1024 * 1024), 2),
        }
        print(json.dumps(summary))
    except Exception as e:
        fail(str(e))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
if __name__ == "__main__":
|
|
92
|
+
main()
|
|
@@ -50,6 +50,51 @@ def _load(file_path: str, options: dict) -> pl.DataFrame:
|
|
|
50
50
|
df = pl.read_ipc(file_path)
|
|
51
51
|
elif ext == ".jsonl":
|
|
52
52
|
df = pl.read_ndjson(file_path)
|
|
53
|
+
elif ext == ".json":
|
|
54
|
+
# Auto-detect: array-of-objects vs NDJSON vs nested structures
|
|
55
|
+
try:
|
|
56
|
+
import json as _json
|
|
57
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
58
|
+
raw_text = fh.read(512) # peek
|
|
59
|
+
stripped = raw_text.lstrip()
|
|
60
|
+
if stripped.startswith("["):
|
|
61
|
+
# Array of objects — standard JSON
|
|
62
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
63
|
+
data = _json.load(fh)
|
|
64
|
+
if isinstance(data, list) and len(data) > 0:
|
|
65
|
+
df = pl.DataFrame(data)
|
|
66
|
+
else:
|
|
67
|
+
raise ValueError("JSON file is empty or not an array of objects")
|
|
68
|
+
elif stripped.startswith("{"):
|
|
69
|
+
# Could be NDJSON or a single object wrapping rows
|
|
70
|
+
try:
|
|
71
|
+
df = pl.read_ndjson(file_path)
|
|
72
|
+
except Exception:
|
|
73
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as fh:
|
|
74
|
+
data = _json.load(fh)
|
|
75
|
+
# Try common wrapper patterns: {"data": [...]}, {"rows": [...]}, etc.
|
|
76
|
+
rows = None
|
|
77
|
+
if isinstance(data, dict):
|
|
78
|
+
for key in ("data", "rows", "records", "items", "results", "entries"):
|
|
79
|
+
if key in data and isinstance(data[key], list):
|
|
80
|
+
rows = data[key]
|
|
81
|
+
break
|
|
82
|
+
if rows is None:
|
|
83
|
+
# Last resort: try to use the dict values
|
|
84
|
+
rows = [data]
|
|
85
|
+
if rows and len(rows) > 0:
|
|
86
|
+
df = pl.DataFrame(rows)
|
|
87
|
+
else:
|
|
88
|
+
raise ValueError("Could not parse JSON structure into tabular data")
|
|
89
|
+
else:
|
|
90
|
+
raise ValueError("JSON file does not start with [ or {")
|
|
91
|
+
except pl.exceptions.ComputeError as ce:
|
|
92
|
+
raise ValueError(f"Failed to parse JSON: {ce}")
|
|
93
|
+
elif ext == ".xlsx":
|
|
94
|
+
try:
|
|
95
|
+
df = pl.read_excel(file_path)
|
|
96
|
+
except Exception as e:
|
|
97
|
+
raise ValueError(f"Failed to read Excel file: {e}")
|
|
53
98
|
else:
|
|
54
99
|
raise ValueError(f"Unsupported input format: {ext}")
|
|
55
100
|
|
|
@@ -12,6 +12,19 @@ except Exception:
|
|
|
12
12
|
HAS_KAGGLE = False
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
IMAGE_EXTENSIONS = {
|
|
16
|
+
".jpg",
|
|
17
|
+
".jpeg",
|
|
18
|
+
".png",
|
|
19
|
+
".webp",
|
|
20
|
+
".bmp",
|
|
21
|
+
".gif",
|
|
22
|
+
".tiff",
|
|
23
|
+
".tif",
|
|
24
|
+
".svg",
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
|
|
15
28
|
def _ensure_auth() -> Dict[str, Any]:
|
|
16
29
|
if not HAS_KAGGLE:
|
|
17
30
|
return {
|
|
@@ -135,7 +148,56 @@ def discover(query: str, limit: int = 20) -> Dict[str, Any]:
|
|
|
135
148
|
return {"ok": False, "error": f"Kaggle discover failed: {str(e)}"}
|
|
136
149
|
|
|
137
150
|
|
|
138
|
-
def
|
|
151
|
+
def _find_image_files(root: str) -> List[str]:
    """Collect every file under ``root`` whose extension is in IMAGE_EXTENSIONS.

    The directory tree is walked recursively and extensions are compared
    case-insensitively.

    Args:
        root: Directory to scan.

    Returns:
        The matching paths (each joined onto its containing directory), sorted.
    """
    found = [
        os.path.join(folder, filename)
        for folder, _, filenames in os.walk(root)
        for filename in filenames
        if os.path.splitext(filename)[1].lower() in IMAGE_EXTENSIONS
    ]
    return sorted(found)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _infer_image_record(root: str, full_path: str, index: int) -> Dict[str, Any]:
|
|
163
|
+
relative_path = os.path.relpath(full_path, root).replace("\\", "/")
|
|
164
|
+
parent_dir = os.path.dirname(relative_path)
|
|
165
|
+
parts = [part for part in parent_dir.split("/") if part and part != "."]
|
|
166
|
+
|
|
167
|
+
split = None
|
|
168
|
+
label = None
|
|
169
|
+
if parts:
|
|
170
|
+
first = parts[0].lower()
|
|
171
|
+
if first in {"train", "test", "val", "valid", "validation"}:
|
|
172
|
+
split = parts[0]
|
|
173
|
+
if len(parts) > 1:
|
|
174
|
+
label = parts[-1]
|
|
175
|
+
else:
|
|
176
|
+
label = parts[-1]
|
|
177
|
+
|
|
178
|
+
record: Dict[str, Any] = {
|
|
179
|
+
"id": index,
|
|
180
|
+
"image_path": os.path.abspath(full_path),
|
|
181
|
+
"relative_path": relative_path,
|
|
182
|
+
"file_name": os.path.basename(full_path),
|
|
183
|
+
"extension": os.path.splitext(full_path)[1].lower().lstrip("."),
|
|
184
|
+
}
|
|
185
|
+
if split:
|
|
186
|
+
record["split"] = split
|
|
187
|
+
if label:
|
|
188
|
+
record["label"] = label
|
|
189
|
+
return record
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _write_image_manifest(root: str, image_files: List[str]) -> str:
    """Write a JSON Lines manifest describing ``image_files`` into ``root``.

    One record per image is produced by ``_infer_image_record``, with the
    image's position in ``image_files`` as its id. The file is named
    ``_vesper_image_manifest.jsonl`` and is overwritten if it already exists.

    Args:
        root: Dataset root; also the directory the manifest is written to.
        image_files: Image paths to describe, in output order.

    Returns:
        Path of the manifest file.
    """
    manifest_path = os.path.join(root, "_vesper_image_manifest.jsonl")
    with open(manifest_path, "w", encoding="utf-8") as out:
        # Stream records one at a time rather than building the whole text first.
        out.writelines(
            json.dumps(_infer_image_record(root, path, position), ensure_ascii=False) + "\n"
            for position, path in enumerate(image_files)
        )
    return manifest_path
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _pick_best_file(root: str) -> Dict[str, Any]:
|
|
139
201
|
candidates: List[str] = []
|
|
140
202
|
for base, _, files in os.walk(root):
|
|
141
203
|
for name in files:
|
|
@@ -145,6 +207,14 @@ def _pick_best_file(root: str) -> str:
|
|
|
145
207
|
candidates.append(full)
|
|
146
208
|
|
|
147
209
|
if not candidates:
|
|
210
|
+
image_files = _find_image_files(root)
|
|
211
|
+
if image_files:
|
|
212
|
+
manifest_path = _write_image_manifest(root, image_files)
|
|
213
|
+
return {
|
|
214
|
+
"local_path": manifest_path,
|
|
215
|
+
"dataset_kind": "image-manifest",
|
|
216
|
+
"image_count": len(image_files),
|
|
217
|
+
}
|
|
148
218
|
raise RuntimeError("No suitable data file found after download")
|
|
149
219
|
|
|
150
220
|
# prioritize common tabular formats
|
|
@@ -152,8 +222,8 @@ def _pick_best_file(root: str) -> str:
|
|
|
152
222
|
for ext in priorities:
|
|
153
223
|
for c in candidates:
|
|
154
224
|
if c.lower().endswith(ext):
|
|
155
|
-
return c
|
|
156
|
-
return candidates[0]
|
|
225
|
+
return {"local_path": c, "dataset_kind": "tabular", "image_count": 0}
|
|
226
|
+
return {"local_path": candidates[0], "dataset_kind": "tabular", "image_count": 0}
|
|
157
227
|
|
|
158
228
|
|
|
159
229
|
def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
|
|
@@ -174,12 +244,14 @@ def download(dataset_ref: str, target_dir: str) -> Dict[str, Any]:
|
|
|
174
244
|
|
|
175
245
|
# unzip in place, remove zip for convenience
|
|
176
246
|
api.dataset_download_files(dataset_ref, path=target_dir, unzip=True, quiet=True)
|
|
177
|
-
|
|
247
|
+
artifact = _pick_best_file(target_dir)
|
|
178
248
|
return {
|
|
179
249
|
"ok": True,
|
|
180
250
|
"dataset_id": dataset_ref,
|
|
181
251
|
"target_dir": target_dir,
|
|
182
|
-
"local_path":
|
|
252
|
+
"local_path": artifact["local_path"],
|
|
253
|
+
"dataset_kind": artifact["dataset_kind"],
|
|
254
|
+
"image_count": artifact.get("image_count", 0),
|
|
183
255
|
}
|
|
184
256
|
except Exception as e:
|
|
185
257
|
msg = str(e)
|