@mariozechner/pi 0.1.2
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- package/LICENSE +21 -0
- package/README.md +317 -0
- package/package.json +42 -0
- package/pi +860 -0
- package/pod_setup.sh +133 -0
- package/vllm_manager.py +499 -0
package/pi
ADDED
@@ -0,0 +1,860 @@
#!/usr/bin/env node
/**
 * Prime Intellect CLI - All-in-one pod management
 */

const fs = require('fs');
const { execSync, spawn } = require('child_process');
const path = require('path');
const os = require('os');

const CONFIG_FILE = path.join(os.homedir(), '.pi_config');
const SCRIPT_DIR = __dirname;

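// Typical workflow (commands are documented in showHelp() below; the pod
// address is the example one used throughout this file):
//   pi setup prod "root@135.181.71.41 -p 22"
//   pi start Qwen/Qwen2.5-7B-Instruct --name qwen
//   pi prompt qwen "What is 2+2?"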
class PrimeIntellectCLI {
  constructor() {
    this.loadConfig();
  }

  loadConfig() {
    if (fs.existsSync(CONFIG_FILE)) {
      this.config = JSON.parse(fs.readFileSync(CONFIG_FILE, 'utf8'));
      // Migrate old single-pod config
      if (this.config.ssh && !this.config.pods) {
        this.config = {
          pods: { 'default': { ssh: this.config.ssh } },
          active: 'default'
        };
        this.saveConfig();
      }
    } else {
      this.config = { pods: {}, active: null };
    }
  }

  saveConfig() {
    fs.writeFileSync(CONFIG_FILE, JSON.stringify(this.config, null, 2));
  }

  getActivePod() {
    if (!this.config.active || !this.config.pods[this.config.active]) {
      return null;
    }
    return this.config.pods[this.config.active];
  }

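  // Run a command on the active pod. Non-interactive commands go through
  // execSync and return captured stdout; interactive ones inherit the TTY.
  // JSON.stringify doubles as a cheap shell-quoting step for the remote command.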
  ssh(command, interactive = false, skipPirc = false) {
    const pod = this.getActivePod();
    if (!pod) {
      console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
      console.error('Example: pi setup prod "root@135.181.71.41 -p 22"');
      console.error('Or activate an existing pod: pi pod <pod-name>');
      process.exit(1);
    }

    // Wrap command to source .pirc first (if it exists), unless skipPirc is true
    const finalCommand = skipPirc ? command : `[ -f ~/.pirc ] && source ~/.pirc; ${command}`;

    if (interactive) {
      // For interactive commands, use spawn with shell
      const sshParts = pod.ssh.split(' ');
      const sshCmd = ['ssh', ...sshParts, finalCommand];
      const proc = spawn(sshCmd[0], sshCmd.slice(1), { stdio: 'inherit', shell: false });
      return new Promise((resolve) => {
        proc.on('close', resolve);
      });
    } else {
      const sshCmd = `ssh ${pod.ssh} ${JSON.stringify(finalCommand)}`;

      // For non-interactive, use execSync
      try {
        return execSync(sshCmd, { encoding: 'utf8' });
      } catch (e) {
        if (e.status !== 0) {
          console.error('SSH command failed:', e.message);
          process.exit(1);
        }
        throw e;
      }
    }
  }

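  // Copy a local file to the pod. ssh and scp disagree on the port flag
  // (-p vs -P), so the port is pulled out of the saved ssh args and re-passed.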
  scp(localFile, remotePath = '~/') {
    const pod = this.getActivePod();
    if (!pod) {
      console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
      process.exit(1);
    }

    const [userHost, ...sshArgs] = pod.ssh.split(' ');
    let scpCmd = `scp`;

    // Add port if specified
    const portArg = sshArgs.find(arg => arg === '-p');
    if (portArg) {
      const portIndex = sshArgs.indexOf(portArg);
      const port = sshArgs[portIndex + 1];
      scpCmd += ` -P ${port}`;
    }

    scpCmd += ` ${localFile} ${userHost}:${remotePath}`;

    try {
      execSync(scpCmd, { stdio: 'inherit' });
    } catch (e) {
      console.error('SCP failed:', e.message);
      process.exit(1);
    }
  }

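  // One-shot pod onboarding: save the SSH target, verify connectivity,
  // copy pod_setup.sh and vllm_manager.py over, then run the setup script
  // remotely with HF_TOKEN forwarded from the local environment.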
  async setup(podName, sshCommand) {
    if (!podName || !sshCommand) {
      console.error('Usage: pi setup <pod-name> <ssh_command>');
      console.error('Example: pi setup prod "root@135.181.71.41 -p 22"');
      process.exit(1);
    }

    // Remove "ssh " prefix if present
    if (sshCommand.toLowerCase().startsWith('ssh ')) {
      sshCommand = sshCommand.substring(4);
    }

    // Save pod config
    if (!this.config.pods) {
      this.config.pods = {};
    }
    this.config.pods[podName] = { ssh: sshCommand };
    this.config.active = podName;
    this.saveConfig();
    console.log(`Saved pod '${podName}' with SSH: ${sshCommand}`);

    // Test connection
    console.log('\nTesting SSH connection...');
    try {
      const hostname = this.ssh('hostname', false, true).trim();
      console.log(`✓ Connected to ${hostname}`);
    } catch (e) {
      console.error('✗ SSH connection failed');
      process.exit(1);
    }

    // Copy setup files
    console.log('\nCopying setup files...');
    this.scp(path.join(SCRIPT_DIR, 'pod_setup.sh'));
    this.scp(path.join(SCRIPT_DIR, 'vllm_manager.py'));

    // Run setup with HF_TOKEN
    console.log('\nRunning setup script...');
    const hfToken = process.env.HF_TOKEN;
    if (!hfToken) {
      console.error('\nERROR: HF_TOKEN environment variable not set');
      console.error('Please export HF_TOKEN before running setup');
      process.exit(1);
    }
    await this.ssh(`export HF_TOKEN="${hfToken}" && bash pod_setup.sh`, true, true);

    console.log('\n✓ Setup complete!');

    // Show usage help
    this.showHelp();
  }

  list() {
    const output = this.ssh('python3 vllm_manager.py list');
    console.log(output);
  }

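  // Accepts "4k"/"32K" style suffixes or plain token counts,
  // e.g. "8k" -> 8192, "4096" -> 4096. Falls back to 8192 when no value is given.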
  parseContextSize(value) {
    if (!value) return 8192;

    // Convert string to lowercase for case-insensitive matching
    const lower = value.toString().toLowerCase();

    // Handle 'k' suffix (4k, 8k, 32k, etc)
    if (lower.endsWith('k')) {
      return parseInt(lower.slice(0, -1)) * 1024;
    }

    // Handle plain numbers
    return parseInt(value);
  }

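  // Accepts a percentage ("30%" -> 0.3) or a fraction ("0.3" -> 0.3);
  // anything above 1 that is not a percentage is rejected.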
  parseMemory(value) {
    if (!value) return 0.9;

    const str = value.toString().toLowerCase();

    // Handle percentage (30%, 50%, etc)
    if (str.endsWith('%')) {
      return parseInt(str.slice(0, -1)) / 100;
    }

    // Handle decimal (0.3, 0.5, etc)
    const num = parseFloat(str);
    if (num > 1) {
      console.error('Memory must be between 0-1 or 0-100%');
      process.exit(1);
    }
    return num;
  }

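  // Parse `pi start` arguments. Everything after --vllm-args is handed to
  // vLLM verbatim (only --name is honored before it); otherwise --context,
  // --memory and --all-gpus are translated into vllm_manager.py arguments.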
  async handleStart(args) {
    if (!args[0]) {
      console.error('Usage: pi start <model> [options]');
      console.error('');
      console.error('Options:');
      console.error('  --name <name>        Model alias (default: auto-generated)');
      console.error('  --context <size>     Context window: 4k, 8k, 16k, 32k or 4096, 8192, etc (default: model default)');
      console.error('  --memory <percent>   GPU memory: 30%, 50%, 90% or 0.3, 0.5, 0.9 (default: 90%)');
      console.error('  --all-gpus           Use all GPUs with tensor parallelism (ignores --memory)');
      console.error('  --vllm-args          Pass remaining args directly to vLLM (ignores other options)');
      console.error('');
      console.error('Examples:');
      console.error('  pi start Qwen/Qwen2.5-7B-Instruct');
      console.error('  pi start Qwen/Qwen2.5-7B-Instruct --name qwen --memory 20%');
      console.error('  pi start meta-llama/Llama-3.1-70B-Instruct --all-gpus');
      console.error('  pi start meta-llama/Llama-3.1-405B --all-gpus --context 128k');
      console.error('');
      console.error('  # Custom vLLM args for Qwen3-Coder on 8xH200:');
      console.error('  pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-coder --vllm-args \\');
      console.error('    --data-parallel-size 8 --enable-expert-parallel \\');
      console.error('    --tool-call-parser qwen3_coder --enable-auto-tool-choice --gpu-memory-utilization 0.9 --max-model-len 200000');
      process.exit(1);
    }

    const modelId = args[0];
    let name = null;
    let context = null; // null = let vLLM use the model's default context window
    let memory = 0.9;
    let allGpus = false;
    let vllmArgs = null;

    // Check for --vllm-args first
    const vllmArgsIndex = args.indexOf('--vllm-args');
    if (vllmArgsIndex !== -1) {
      // Extract name if provided before --vllm-args
      for (let i = 1; i < vllmArgsIndex; i++) {
        if (args[i] === '--name' && args[i + 1]) {
          name = args[++i];
        }
      }
      // Everything after --vllm-args is passed to vLLM
      vllmArgs = args.slice(vllmArgsIndex + 1).join(' ');
    } else {
      // Parse normal arguments
      for (let i = 1; i < args.length; i++) {
        switch (args[i]) {
          case '--name':
            name = args[++i];
            break;
          case '--context':
            context = this.parseContextSize(args[++i]);
            break;
          case '--memory':
            memory = this.parseMemory(args[++i]);
            break;
          case '--all-gpus':
            allGpus = true;
            break;
          default:
            console.error(`Unknown option: ${args[i]}`);
            process.exit(1);
        }
      }
    }

    // Check for multi-GPU setup
    const gpuCount = await this.getGpuCount();

    if (allGpus) {
      if (memory !== 0.9) {
        console.log('Warning: --memory ignored with --all-gpus (using 95% memory across all GPUs)');
      }
      memory = 0.95;

      if (gpuCount === 1) {
        console.log('Note: --all-gpus specified but only 1 GPU found');
        allGpus = false;
      }
    }

    // Auto-generate name if not provided
    if (!name) {
      // Extract model name from path (e.g., "Phi-3-mini" from "microsoft/Phi-3-mini-4k-instruct")
      const parts = modelId.split('/');
      const modelName = parts[parts.length - 1];
      name = modelName.toLowerCase()
        .replace(/-instruct$/, '')
        .replace(/-chat$/, '')
        .replace(/[^a-z0-9-]/g, '-')
        .replace(/-+/g, '-')
        .replace(/^-|-$/g, '')
        .slice(0, 20);
    }

    // If vllmArgs provided, use raw vLLM command
    if (vllmArgs) {
      await this.startRaw(modelId, name, vllmArgs);
    } else {
      // Call the original start method with positional args
      const contextStr = context ? context.toString() : null;
      await this.start(modelId, name, contextStr, memory.toString(), { allGpus, gpuCount });
    }
  }

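  // Count GPUs on the pod via nvidia-smi; assume a single GPU if the probe fails.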
  async getGpuCount() {
    try {
      const output = this.ssh('nvidia-smi --query-gpu=name --format=csv,noheader | wc -l');
      return parseInt(output.trim()) || 1;
    } catch {
      return 1;
    }
  }

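  // Start a model through vllm_manager.py. As assembled below, the remote
  // arguments are positional:
  //   start <model_id> [name] [max_len] [gpu_memory] [tensor_parallel_size]
  // with a literal '""' standing in for "use the default" in skipped slots.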
  async start(modelId, name, maxLen = null, gpuMemory, options = {}) {
    // Check if name is already in use locally first
    if (name) {
      const runningModels = this.getRunningModels();
      if (runningModels[name]) {
        console.error(`Error: Model name '${name}' is already in use`);
        console.error('Running models:', Object.keys(runningModels).join(', '));
        process.exit(1);
      }
    }

    // Build args for vllm_manager.py
    let args = modelId;

    // Handle optional parameters
    if (name || maxLen || gpuMemory) {
      args += ` ${name || '""'}`;

      if (maxLen || gpuMemory) {
        args += ` ${maxLen || '""'}`; // Pass empty string to use vLLM default

        if (gpuMemory) {
          args += ` ${gpuMemory}`;
        }
      }
    }

    // Handle multi-GPU options
    let envPrefix = '';
    if (options.allGpus && options.gpuCount > 1) {
      args += ` ${options.gpuCount}`; // Pass tensor parallel size
    }

    const output = this.ssh(`${envPrefix}python3 vllm_manager.py start ${args}`);

    // Extract model name and connection info from output
    const nameMatch = output.match(/Started (\S+)/);
    const urlMatch = output.match(/URL: (http:\/\/[^\s]+)/);
    const exportMatch = output.match(/export OPENAI_BASE_URL='([^']+)'/);

    if (nameMatch) {
      const modelName = nameMatch[1];
      const url = urlMatch ? urlMatch[1] : null;
      const exportCmd = exportMatch ? `export OPENAI_BASE_URL='${exportMatch[1]}'` : null;

      console.log(`\nStarted ${modelName}`);
      console.log('Waiting for model to initialize...\n');

      // Set up Ctrl+C handler for manual interruption
      const showModelInfo = () => {
        console.log('\n\n' + '='.repeat(60));
        console.log('Model Information:');
        console.log('='.repeat(60));
        console.log(`Name: ${modelName}`);
        if (url) console.log(`URL: ${url}`);
        if (exportCmd) {
          console.log(`\nTo use with OpenAI clients:`);
          console.log(exportCmd);
          console.log(`export OPENAI_API_KEY='dummy'`);
        }
        console.log('='.repeat(60));
      };

      process.on('SIGINT', () => {
        showModelInfo();
        process.exit(0);
      });

      // Watch logs until startup complete
      await this.logs(modelName, true); // autoExit = true for startup

      // Show model info after automatic exit
      showModelInfo();
    } else {
      console.log(output);
    }
  }

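  // Start a model with user-supplied vLLM flags. The flag string is
  // base64-encoded locally so it survives the ssh/shell quoting layers;
  // vllm_manager.py is expected to decode it on the other side.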
  async startRaw(modelId, name, vllmArgs) {
    // Check if name is already in use
    const runningModels = this.getRunningModels();
    if (runningModels[name]) {
      console.error(`Error: Model name '${name}' is already in use`);
      console.error('Running models:', Object.keys(runningModels).join(', '));
      process.exit(1);
    }

    console.log(`Starting ${name} with custom vLLM args...`);

    // Start vLLM with raw arguments - use base64 to safely pass complex args
    const base64Args = Buffer.from(vllmArgs).toString('base64');
    const output = this.ssh(`python3 vllm_manager.py start_raw "${modelId}" "${name}" "${base64Args}"`);

    // Extract connection info from output
    const urlMatch = output.match(/URL: (http:\/\/[^\s]+)/);
    const exportMatch = output.match(/export OPENAI_BASE_URL='([^']+)'/);

    if (urlMatch || exportMatch) {
      const url = urlMatch ? urlMatch[1] : null;
      const exportCmd = exportMatch ? `export OPENAI_BASE_URL='${exportMatch[1]}'` : null;

      console.log(`\nStarted ${name}`);
      console.log('Waiting for model to initialize...\n');

      // Set up Ctrl+C handler for manual interruption
      const showModelInfo = () => {
        console.log('\n\n' + '='.repeat(60));
        console.log('Model Information:');
        console.log('='.repeat(60));
        console.log(`Name: ${name}`);
        if (url) console.log(`URL: ${url}`);
        if (exportCmd) {
          console.log(`\nTo use with OpenAI clients:`);
          console.log(exportCmd);
          console.log(`export OPENAI_API_KEY='dummy'`);
        }
        console.log('='.repeat(60));
      };

      process.on('SIGINT', () => {
        showModelInfo();
        process.exit(0);
      });

      // Watch logs until startup complete
      await this.logs(name, true); // autoExit = true for startup

      // Show model info after automatic exit
      showModelInfo();
    } else {
      console.log(output);
    }
  }

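  // Stop one model by name, or every running model when no name is given.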
  stop(name) {
    if (!name) {
      // Stop all models
      const runningModels = this.getRunningModels();
      const modelNames = Object.keys(runningModels);

      if (modelNames.length === 0) {
        console.log('No models running');
        return;
      }

      console.log(`Stopping ${modelNames.length} model(s): ${modelNames.join(', ')}`);

      for (const modelName of modelNames) {
        const output = this.ssh(`python3 vllm_manager.py stop ${modelName}`);
        console.log(output);
      }
    } else {
      // Stop specific model
      const output = this.ssh(`python3 vllm_manager.py stop ${name}`);
      console.log(output);
    }
  }

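  // Tail a model's log file. The path is scraped from `vllm_manager.py list`
  // output; with autoExit the tail stops once vLLM reports
  // 'Application startup complete.' (or the Uvicorn banner).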
  async logs(name, autoExit = false) {
    if (!name) {
      console.error('Usage: pi logs <name>');
      process.exit(1);
    }

    // Use vllm_manager.py to get the log file path
    const infoOutput = this.ssh(`python3 vllm_manager.py list`);

    // Extract log file path from the output
    const lines = infoOutput.split('\n');
    let logFile = null;
    let inModel = false;

    for (const line of lines) {
      if (line.startsWith(`${name}:`)) {
        inModel = true;
      } else if (inModel && line.includes('Logs:')) {
        logFile = line.split('Logs:')[1].trim();
        break;
      }
    }

    if (!logFile) {
      console.error(`No logs found for ${name}`);
      process.exit(1);
    }

    // Use a custom tail that watches for startup complete
    const pod = this.getActivePod();
    const sshCmd = `ssh ${pod.ssh} tail -n 50 -f ${logFile}`;

    return new Promise((resolve) => {
      const [cmd, ...args] = sshCmd.split(' ');
      const proc = spawn(cmd, args, { stdio: ['inherit', 'pipe', 'pipe'] });

      let buffer = '';

      proc.stdout.on('data', (data) => {
        process.stdout.write(data);
        buffer += data.toString();

        // Only check for startup messages if autoExit is enabled
        if (autoExit) {
          if (buffer.includes('Application startup complete.') ||
              buffer.includes('Uvicorn running on')) {
            setTimeout(() => {
              proc.kill();
              resolve();
            }, 500); // Small delay to ensure final messages are shown
          }
        }

        // Keep buffer size manageable
        if (buffer.length > 10000) {
          buffer = buffer.slice(-5000);
        }
      });

      proc.stderr.on('data', (data) => {
        process.stderr.write(data);
      });

      proc.on('close', () => {
        resolve();
      });
    });
  }

  async shell() {
    const pod = this.getActivePod();
    if (!pod) {
      console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
      process.exit(1);
    }

    console.log('Connecting to pod...');

    // Use spawn directly for interactive shell
    const sshParts = pod.ssh.split(' ');
    const sshCmd = ['ssh', ...sshParts];
    const proc = spawn(sshCmd[0], sshCmd.slice(1), { stdio: 'inherit' });

    return new Promise((resolve) => {
      proc.on('close', resolve);
    });
  }

  listPods() {
    if (!this.config.pods || Object.keys(this.config.pods).length === 0) {
      console.log('No pods configured. Run: pi setup <pod-name> <ssh_command>');
      return;
    }

    console.log('Configured pods:\n');

    // Show active pod first
    if (this.config.active && this.config.pods[this.config.active]) {
      console.log(`● ${this.config.active} (active)`);
      console.log(`  ${this.config.pods[this.config.active].ssh}\n`);
    }

    // Show other pods
    Object.keys(this.config.pods).sort().forEach(name => {
      if (name !== this.config.active) {
        console.log(`○ ${name}`);
        console.log(`  ${this.config.pods[name].ssh}`);
      }
    });
  }

  switchPod(podName) {
    if (!this.config.pods || !this.config.pods[podName]) {
      console.error(`Pod '${podName}' not found`);
      console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
      process.exit(1);
    }

    this.config.active = podName;
    this.saveConfig();
    console.log(`Switched to pod: ${podName} (${this.config.pods[podName].ssh})`);
  }

  removePod(podName) {
    if (!this.config.pods || !this.config.pods[podName]) {
      console.error(`Pod '${podName}' not found`);
      console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
      process.exit(1);
    }

    delete this.config.pods[podName];

    // If we removed the active pod, clear it or switch to another
    if (this.config.active === podName) {
      const remainingPods = Object.keys(this.config.pods);
      this.config.active = remainingPods.length > 0 ? remainingPods[0] : null;
    }

    this.saveConfig();
    console.log(`Removed pod: ${podName}`);
    if (this.config.active) {
      console.log(`Active pod is now: ${this.config.active}`);
    }
  }

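  // Query the HuggingFace model API (top 20 text-generation matches by
  // downloads) and print vLLM-compatible results; MLX and GGUF repos are
  // skipped since vLLM cannot serve those formats.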
  async searchModels(query) {
    console.log(`Searching HuggingFace for models matching "${query}"...\n`);

    try {
      const response = await fetch(`https://huggingface.co/api/models?search=${encodeURIComponent(query)}&filter=text-generation&sort=downloads&limit=20`);
      const data = await response.json();

      if (!data || data.length === 0) {
        console.log('No models found');
        return;
      }

      // Format results
      console.log('Popular models (sorted by downloads):\n');
      for (const model of data) {
        const modelName = model.modelId.toLowerCase();

        // Skip incompatible formats
        if (modelName.includes('-mlx-') || modelName.includes('-mlx')) {
          continue; // MLX is for Apple Silicon only
        }
        if (modelName.includes('-gguf') || modelName.includes('.gguf')) {
          continue; // GGUF is for llama.cpp, not vLLM
        }

        const downloads = model.downloads || 0;
        const likes = model.likes || 0;

        console.log(`\x1b[1m${model.modelId}\x1b[0m`); // Bold
        console.log(`  \x1b[36mhttps://huggingface.co/${model.modelId}\x1b[0m`); // Cyan for URL
        console.log(`  Downloads: ${downloads.toLocaleString()} | Likes: ${likes}`);

        // Check for quantization
        if (modelName.includes('-fp8') || modelName.includes('fp8-')) {
          console.log(`  \x1b[33mNote: FP8 quantized - requires GPU with FP8 support\x1b[0m`);
        }

        console.log(`  pi start ${model.modelId}`);
        console.log();
      }

      // Add HuggingFace search URL
      console.log(`\nView more models on HuggingFace:`);
      console.log(`\x1b[36mhttps://huggingface.co/models?search=${encodeURIComponent(query)}&sort=downloads&pipeline_tag=text-generation\x1b[0m`);
    } catch (error) {
      console.error('Error searching models:', error.message);
    }
  }

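  // Send a single chat message to a running model via its OpenAI-compatible
  // /chat/completions endpoint and print the reply.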
  async prompt(name, message) {
    // Get model info
    const models = this.getRunningModels();
    const model = models[name];

    if (!model || !model.url) {
      console.error(`Model '${name}' is not running`);
      console.error('Running models:', Object.keys(models).join(', ') || 'none');
      process.exit(1);
    }

    // Make API call directly to the model's external URL
    const url = `${model.url}/chat/completions`;
    const payload = {
      model: model.model_id,
      messages: [{ role: 'user', content: message }],
      max_tokens: 500,
      temperature: 0.7
    };

    try {
      const response = await fetch(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(payload)
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${await response.text()}`);
      }

      const data = await response.json();
      console.log(data.choices[0].message.content);
    } catch (error) {
      console.error('Error:', error.message);
      process.exit(1);
    }
  }

  showHelp() {
    console.log('\nPrime Intellect CLI\n');

    console.log('Pod Management:');
    console.log('  pi setup <pod-name> <ssh_command>  Configure and activate a pod');
    console.log('  pi pods                            List all pods (active pod marked)');
    console.log('  pi pod <pod-name>                  Switch active pod');
    console.log('  pi pod remove <pod-name>           Remove pod from config\n');
    console.log('Model Management:');
    console.log('  pi list                            List running models');
    console.log('  pi search <query>                  Search HuggingFace models');
    console.log('  pi start <model> [options]         Start a model');
    console.log('  pi stop [name]                     Stop a model (or all if no name)');
    console.log('  pi logs <name>                     View model logs');
    console.log('  pi prompt <name> <msg>             Chat with a model\n');
    console.log('Start Options:');
    console.log('  --name <name>        Model alias (default: auto-generated)');
    console.log('  --context <size>     Context window: 4k, 16k, 32k (default: model default)');
    console.log('  --memory <percent>   GPU memory: 30%, 50%, 90% (default: 90%)');
    console.log('  --all-gpus           Use all GPUs with tensor parallelism\n');
    console.log('Utility:');
    console.log('  pi shell                           SSH into active pod');

    console.log('\nQuick Example:');
    console.log('  pi start Qwen/Qwen2.5-7B-Instruct --name qwen');
    console.log('  pi prompt qwen "What is 2+2?"');

    if (this.config.active && this.config.pods[this.config.active]) {
      console.log(`\nActive pod: ${this.config.active} (${this.config.pods[this.config.active].ssh})`);
    } else {
      console.log('\nNo active pod');
    }
  }

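  // Parse `vllm_manager.py list` output into a map. The expected shape
  // (inferred from the parser below) is a "name:" header line followed by
  // indented "Model:", "Port:" and "URL:" fields.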
  getRunningModels() {
    try {
      const output = this.ssh('python3 vllm_manager.py list');
      const models = {};

      // Parse the output to extract model info
      const lines = output.split('\n');
      let currentModel = null;

      for (const line of lines) {
        if (line.match(/^[a-zA-Z0-9_-]+:$/)) {
          currentModel = line.slice(0, -1);
          models[currentModel] = {};
        } else if (currentModel) {
          if (line.includes('Model:')) {
            models[currentModel].model_id = line.split('Model:')[1].trim();
          } else if (line.includes('Port:')) {
            models[currentModel].port = parseInt(line.split('Port:')[1].trim());
          } else if (line.includes('URL:')) {
            models[currentModel].url = line.split('URL:')[1].trim();
          }
        }
      }

      return models;
    } catch (e) {
      return {};
    }
  }

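  // Top-level command dispatch for `pi <command> [args]`.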
  async run() {
    const [,, command, ...args] = process.argv;

    switch (command) {
      case 'setup': {
        if (args.length < 2) {
          console.error('Usage: pi setup <pod-name> <ssh_command>');
          console.error('Example: pi setup prod "root@135.181.71.41 -p 22"');
          process.exit(1);
        }
        const podName = args[0];
        const sshCmd = args.slice(1).join(' ');
        await this.setup(podName, sshCmd);
        break;
      }
      case 'pods':
        this.listPods();
        break;

      case 'pod':
        if (!args[0]) {
          console.error('Usage: pi pod <pod-name>');
          console.error('       pi pod remove <pod-name>');
          process.exit(1);
        }
        if (args[0] === 'remove' && args[1]) {
          this.removePod(args[1]);
        } else {
          this.switchPod(args[0]);
        }
        break;

      case 'list':
      case 'ls':
        this.list();
        break;

      case 'search':
        if (!args[0]) {
          console.error('Usage: pi search <query>');
          console.error('Example: pi search qwen');
          process.exit(1);
        }
        await this.searchModels(args[0]);
        break;

      case 'start':
        await this.handleStart(args);
        break;

      case 'stop':
        this.stop(args[0]);
        break;

      case 'logs':
        await this.logs(args[0], false); // autoExit = false for manual logs command
        break;

      case 'prompt': {
        if (args.length < 2) {
          console.error('Usage: pi prompt <model_name> "<message>"');
          console.error('Example: pi prompt phi3 "Hey, how you going"');
          process.exit(1);
        }
        const modelName = args[0];
        const message = args.slice(1).join(' ');
        await this.prompt(modelName, message);
        break;
      }
      case 'shell':
        await this.shell();
        break;

      case 'ssh':
        // Pass through any SSH command
        if (args.length > 0) {
          const output = this.ssh(args.join(' '));
          console.log(output);
        } else {
          await this.shell();
        }
        break;

      default:
        this.showHelp();
    }
  }
}

// Run CLI
const cli = new PrimeIntellectCLI();
cli.run().catch(console.error);
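
// The on-disk config written by saveConfig() to ~/.pi_config has this shape
// (a minimal sketch based on loadConfig/saveConfig above; values illustrative):
// {
//   "pods": { "prod": { "ssh": "root@135.181.71.41 -p 22" } },
//   "active": "prod"
// }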