@mariozechner/pi 0.1.5 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +98 -2
- package/package.json +1 -1
- package/pi.js +576 -75
- package/pod_setup.sh +55 -114
- package/vllm_manager.py +167 -4
package/pi.js
CHANGED
@@ -1,6 +1,6 @@
 #!/usr/bin/env node
 /**
- *
+ * pi CLI
  */

 const fs = require('fs');
@@ -11,7 +11,7 @@ const os = require('os');
 const CONFIG_FILE = path.join(os.homedir(), '.pi_config');
 const SCRIPT_DIR = __dirname;

-class
+class PiCli {
     constructor() {
         this.loadConfig();
     }
@@ -43,12 +43,17 @@ class PrimeIntellectCLI {
         return this.config.pods[this.config.active];
     }

-    ssh(command, interactive = false, skipPirc = false) {
-        const pod = this.getActivePod();
+    ssh(command, interactive = false, skipPirc = false, podName = null) {
+        const pod = podName ? this.config.pods[podName] : this.getActivePod();
         if (!pod) {
-
-
-
+            if (podName) {
+                console.error(`Pod '${podName}' not found`);
+                console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
+            } else {
+                console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
+                console.error('Example: pi setup prod "root@135.181.71.41 -p 22"');
+                console.error('Or activate an existing pod: pi pod <pod-name>');
+            }
             process.exit(1);
         }

@@ -79,10 +84,14 @@ class PrimeIntellectCLI {
         }
     }

-    scp(localFile, remotePath = '~/') {
-        const pod = this.getActivePod();
+    scp(localFile, remotePath = '~/', podName = null) {
+        const pod = podName ? this.config.pods[podName] : this.getActivePod();
         if (!pod) {
-
+            if (podName) {
+                console.error(`Pod '${podName}' not found`);
+            } else {
+                console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
+            }
             process.exit(1);
         }

@@ -159,8 +168,8 @@ class PrimeIntellectCLI {
         this.showHelp();
     }

-    list() {
-        const output = this.ssh('python3 vllm_manager.py list');
+    list(podName = null) {
+        const output = this.ssh('python3 vllm_manager.py list', false, false, podName);
         console.log(output);
     }

@@ -207,6 +216,8 @@ class PrimeIntellectCLI {
         console.error(' --context <size> Context window: 4k, 8k, 16k, 32k, 64k, 128k or 4096, 8192, etc (default: model default)');
         console.error(' --memory <percent> GPU memory: 30%, 50%, 90% or 0.3, 0.5, 0.9 (default: 90%)');
         console.error(' --all-gpus Use all GPUs with tensor parallelism (ignores --memory)');
+        console.error(' --debug Enable debug logging for vLLM');
+        console.error(' --pod <name> Run on specific pod (default: active pod)');
         console.error(' --vllm-args Pass remaining args directly to vLLM (ignores other options)');
         console.error('');
         console.error('Examples:');
@@ -216,9 +227,9 @@ class PrimeIntellectCLI {
         console.error(' pi start meta-llama/Llama-3.1-405B --all-gpus --context 128k');
         console.error('');
         console.error(' # Custom vLLM args for Qwen3-Coder on 8xH200:');
-        console.error(' pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-coder --vllm-args
-        console.error(' --data-parallel-size 8 --enable-expert-parallel
-        console.error(' --tool-call-parser qwen3_coder --enable-auto-tool-choice --gpu-memory-utilization 0.
+        console.error(' pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-coder --vllm-args \\\\');
+        console.error(' --data-parallel-size 8 --enable-expert-parallel \\\\');
+        console.error(' --tool-call-parser qwen3_coder --enable-auto-tool-choice --gpu-memory-utilization 0.95 --max-model-len 200000');
         process.exit(1);
     }

@@ -227,15 +238,21 @@ class PrimeIntellectCLI {
         let context = null; // Changed to null - let vLLM use model default
         let memory = 0.9;
         let allGpus = false;
+        let debug = false;
         let vllmArgs = null;
+        let podName = null;

         // Check for --vllm-args first
         const vllmArgsIndex = args.indexOf('--vllm-args');
         if (vllmArgsIndex !== -1) {
-            // Extract name if provided before --vllm-args
+            // Extract name and pod if provided before --vllm-args
             for (let i = 1; i < vllmArgsIndex; i++) {
                 if (args[i] === '--name' && args[i + 1]) {
                     name = args[++i];
+                } else if (args[i] === '--pod' && args[i + 1]) {
+                    podName = args[++i];
+                } else if (args[i] === '--debug') {
+                    debug = true;
                 }
             }
             // Everything after --vllm-args is passed to vLLM
@@ -256,6 +273,12 @@ class PrimeIntellectCLI {
                 case '--all-gpus':
                     allGpus = true;
                     break;
+                case '--debug':
+                    debug = true;
+                    break;
+                case '--pod':
+                    podName = args[++i];
+                    break;
                 default:
                     console.error(`Unknown option: ${args[i]}`);
                     process.exit(1);
@@ -264,7 +287,7 @@ class PrimeIntellectCLI {
         }

         // Check for multi-GPU setup
-        const gpuCount = await this.getGpuCount();
+        const gpuCount = await this.getGpuCount(podName);

         if (allGpus) {
             if (memory !== 0.9) {
@@ -292,29 +315,170 @@ class PrimeIntellectCLI {
             .slice(0, 20);
         }

-        // If vllmArgs provided,
+        // If vllmArgs provided, skip memory check since we don't know the parallelism strategy
         if (vllmArgs) {
-            await this.
-
-
-
-
+            const modelEstimate = await this.getModelMemoryEstimate(modelId, context);
+            if (modelEstimate) {
+                console.log(`Model weights: ${modelEstimate.modelSizeGB.toFixed(1)}GB`);
+                console.log(`Context length: ${modelEstimate.contextLength.toLocaleString()} tokens`);
+            }
+            console.log(`Target pod: ${podName || this.config.active || 'active pod'}`);
+            await this.startRaw(modelId, name, vllmArgs, debug, podName);
+            return;
+        }
+
+        // For standard deployment, check memory
+        const modelEstimate = await this.getModelMemoryEstimate(modelId, context);
+
+        // Check GPU memory before starting
+        console.log('Checking model size and GPU memory...');
+        console.log(`Target pod: ${podName || this.config.active || 'active pod'}`);
+        const [memoryInfo, modelEstimateWithContext] = await Promise.all([
+            this.getGpuMemoryInfo(podName),
+            modelEstimate
+        ]);
+
+        if (memoryInfo && modelEstimateWithContext) {
+            // For tensor parallel (--all-gpus), memory is distributed across GPUs
+            const effectiveMemoryNeeded = allGpus && gpuCount > 1
+                ? modelEstimateWithContext.estimatedMemoryGB / gpuCount
+                : modelEstimateWithContext.estimatedMemoryGB;
+
+            const memoryPerGpu = memoryInfo.freeMemoryGB / (gpuCount || 1);
+
+            console.log(`Model weights: ${modelEstimateWithContext.modelSizeGB.toFixed(1)}GB`);
+            console.log(`Context length: ${modelEstimateWithContext.contextLength.toLocaleString()} tokens`);
+            console.log(`Note: Estimate includes model parameters only, not KV cache for context`);
+            console.log(`Available GPU memory: ${memoryInfo.freeMemoryGB.toFixed(1)}GB total (${memoryPerGpu.toFixed(1)}GB per GPU)`);
+
+            if (effectiveMemoryNeeded > memoryPerGpu) {
+                // Log a BIG WARNING as requested
+                console.error(`\n❌ BIG WARNING: Insufficient GPU memory`);
+                if (allGpus && gpuCount > 1) {
+                    console.error(` Model needs ~${effectiveMemoryNeeded.toFixed(1)}GB per GPU but only ${memoryPerGpu.toFixed(1)}GB available`);
+                } else {
+                    console.error(` Model needs ~${modelEstimateWithContext.estimatedMemoryGB.toFixed(1)}GB but only ${memoryInfo.freeMemoryGB.toFixed(1)}GB available`);
+                }
+                console.error('\n Free up memory by stopping running models:');
+                console.error(' pi list # See running models');
+                console.error(' pi stop <model_name> # Stop specific model');
+                console.error(' pi stop # Stop all models\n');
+                // Don't exit, just warn and proceed
+            }
         }
+
+        // Call the original start method with positional args
+        const contextStr = context ? context.toString() : null;
+        await this.start(modelId, name, contextStr, memory.toString(), { allGpus, gpuCount, debug, podName });
     }

-    async getGpuCount() {
+    async getGpuCount(podName = null) {
         try {
-            const output = this.ssh('nvidia-smi --query-gpu=name --format=csv,noheader | wc -l');
+            const output = this.ssh('nvidia-smi --query-gpu=name --format=csv,noheader | wc -l', false, false, podName);
             return parseInt(output.trim()) || 1;
         } catch {
             return 1;
         }
     }

+    async getGpuMemoryInfo(podName = null) {
+        try {
+            const output = this.ssh('nvidia-smi --query-gpu=memory.total,memory.free --format=csv,noheader,nounits', false, false, podName);
+            const lines = output.trim().split('\n');
+            let totalMemoryGB = 0;
+            let freeMemoryGB = 0;
+
+            for (const line of lines) {
+                const [total, free] = line.split(',').map(x => parseInt(x.trim()));
+                totalMemoryGB += total / 1024;
+                freeMemoryGB += free / 1024;
+            }
+
+            return { totalMemoryGB, freeMemoryGB };
+        } catch (e) {
+            return null;
+        }
+    }
+
+    async getModelMemoryEstimate(modelId, contextLength = null) {
+        try {
+            const response = await fetch(`https://huggingface.co/api/models/${modelId}`);
+            const data = await response.json();
+
+            if (data.safetensors?.parameters) {
+                // Calculate actual model size based on parameter counts and types
+                const dtypeSizes = {
+                    'F64': 8, // float64 - 8 bytes
+                    'F32': 4, // float32 - 4 bytes
+                    'BF16': 2, // bfloat16 - 2 bytes
+                    'F16': 2, // float16 - 2 bytes
+                    'I32': 4, // int32 - 4 bytes
+                    'I16': 2, // int16 - 2 bytes
+                    'I8': 1, // int8 - 1 byte
+                    'U8': 1, // uint8 - 1 byte
+                    'I4': 0.5, // int4 - 0.5 bytes (packed)
+                    'F8_E4M3': 1, // FP8 E4M3 format - 1 byte
+                    'F8_E5M2': 1, // FP8 E5M2 format - 1 byte
+                    'Q8_0': 1, // GGML quantization formats
+                    'Q4_0': 0.5, // GGML quantization formats
+                    'Q4_1': 0.5, // GGML quantization formats
+                    'Q5_0': 0.625, // GGML quantization formats
+                    'Q5_1': 0.625 // GGML quantization formats
+                };
+
+                let totalBytes = 0;
+                let paramDetails = [];
+
+                // Calculate bytes for each dtype
+                let unknownDtypes = [];
+                for (const [dtype, paramCount] of Object.entries(data.safetensors.parameters)) {
+                    let bytesPerParam = dtypeSizes[dtype];
+                    if (bytesPerParam === undefined) {
+                        // Unknown dtype - assume 1 byte (most new formats are quantized)
+                        bytesPerParam = 1; // Conservative for memory checking
+                        unknownDtypes.push(dtype);
+                    }
+                    const bytes = paramCount * bytesPerParam;
+                    totalBytes += bytes;
+                    paramDetails.push({ dtype, count: paramCount, bytes });
+                }
+
+                if (unknownDtypes.length > 0) {
+                    console.warn(`Unknown dtype(s) found: ${unknownDtypes.join(', ')}. Assuming 1 byte per parameter.`);
+                }
+
+                const modelSizeGB = totalBytes / (1024 ** 3);
+
+                // Try to get model config for context length
+                let maxContextLength = contextLength;
+                try {
+                    const configResponse = await fetch(`https://huggingface.co/${modelId}/raw/main/config.json`);
+                    if (configResponse.ok) {
+                        const config = await configResponse.json();
+                        maxContextLength = contextLength || config.max_position_embeddings || 8192;
+                    }
+                } catch (e) {
+                    maxContextLength = contextLength || 8192;
+                }
+
+                return {
+                    modelSizeGB,
+                    estimatedMemoryGB: modelSizeGB, // Only model weights, not KV cache
+                    contextLength: maxContextLength,
+                    paramDetails // For debugging
+                };
+            }
+
+            return null;
+        } catch (e) {
+            return null;
+        }
+    }
+
     async start(modelId, name, maxLen = null, gpuMemory, options = {}) {
         // Check if name is already in use locally first
         if (name) {
-            const runningModels = this.getRunningModels();
+            const runningModels = this.getRunningModels(options.podName);
             if (runningModels[name]) {
                 console.error(`Error: Model name '${name}' is already in use`);
                 console.error('Running models:', Object.keys(runningModels).join(', '));
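Note: the getModelMemoryEstimate helper added above derives a weights-only size from the public HuggingFace model API (safetensors.parameters) and deliberately ignores KV-cache memory. A minimal standalone sketch of the same calculation, runnable on Node 18+ (the estimateWeightsGB name is illustrative, not part of the package):

// Weights-only size estimate from HuggingFace safetensors metadata,
// mirroring the dtype-to-bytes table used in pi.js; KV cache is not included.
const BYTES_PER_PARAM = { F64: 8, F32: 4, BF16: 2, F16: 2, I32: 4, I16: 2, I8: 1, U8: 1, I4: 0.5, F8_E4M3: 1, F8_E5M2: 1 };

async function estimateWeightsGB(modelId) {
    const res = await fetch(`https://huggingface.co/api/models/${modelId}`);
    const data = await res.json();
    const params = data.safetensors?.parameters;
    if (!params) return null; // model publishes no safetensors metadata
    let totalBytes = 0;
    for (const [dtype, count] of Object.entries(params)) {
        totalBytes += count * (BYTES_PER_PARAM[dtype] ?? 1); // unknown dtypes assumed 1 byte, as in pi.js
    }
    return totalBytes / (1024 ** 3);
}

// Example: estimateWeightsGB('Qwen/Qwen2.5-7B-Instruct').then(gb => console.log(`${gb.toFixed(1)}GB`));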
@@ -322,6 +486,8 @@ class PrimeIntellectCLI {
             }
         }

+        // Memory check is already done in handleStart, skip it here
+
         // Build args for vllm_manager.py
         let args = modelId;

@@ -344,7 +510,12 @@ class PrimeIntellectCLI {
             args += ` ${options.gpuCount}`; // Pass tensor parallel size
         }

-
+        // Add debug logging if requested
+        if (options.debug) {
+            envPrefix = 'VLLM_LOGGING_LEVEL=DEBUG ';
+        }
+
+        const output = this.ssh(`${envPrefix}python3 vllm_manager.py start ${args}`, false, false, options.podName);

         // Extract model name and connection info from output
         const nameMatch = output.match(/Started (\S+)/);
@@ -370,6 +541,7 @@ class PrimeIntellectCLI {
                 console.log(`\nTo use with OpenAI clients:`);
                 console.log(exportCmd);
                 console.log(`export OPENAI_API_KEY='dummy'`);
+                console.log(`export OPENAI_MODEL='${modelId}'`);
             }
             console.log('='.repeat(60));
         };
@@ -380,29 +552,58 @@ class PrimeIntellectCLI {
             });

             // Watch logs until startup complete
-            await this.logs(modelName, true); // autoExit = true for startup
+            await this.logs(modelName, true, options.podName); // autoExit = true for startup

-            //
+            // Warm up the model with a simple prompt
+            console.log('\nWarming up model...');
+            try {
+                const warmupUrl = `${url}/chat/completions`;
+                const warmupPayload = {
+                    model: modelId,
+                    messages: [{ role: 'user', content: 'Hi' }],
+                    max_tokens: 1,
+                    temperature: 0
+                };
+
+                const warmupResponse = await fetch(warmupUrl, {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify(warmupPayload)
+                });
+
+                if (warmupResponse.ok) {
+                    console.log('✓ Model warmed up and ready!');
+                } else {
+                    console.log('⚠ Warmup failed, but model should still work');
+                }
+            } catch (e) {
+                console.log('⚠ Could not warm up model:', e.message);
+            }
+
+            // Show model info after warmup
             showModelInfo();
         } else {
             console.log(output);
         }
     }

-    async startRaw(modelId, name, vllmArgs) {
+    async startRaw(modelId, name, vllmArgs, debug = false, podName = null) {
+        // Skip memory check for raw vLLM args since we don't know what custom settings are used
+        console.log('Note: Memory checking disabled when using --vllm-args');
         // Check if name is already in use
-        const runningModels = this.getRunningModels();
+        const runningModels = this.getRunningModels(podName);
         if (runningModels[name]) {
             console.error(`Error: Model name '${name}' is already in use`);
             console.error('Running models:', Object.keys(runningModels).join(', '));
             process.exit(1);
         }

-        console.log(`Starting ${name} with custom vLLM args
+        console.log(`Starting ${name} with custom vLLM args on pod: ${podName || this.config.active || 'active pod'}`);

         // Start vLLM with raw arguments - use base64 to safely pass complex args
         const base64Args = Buffer.from(vllmArgs).toString('base64');
-        const
+        const envPrefix = debug ? 'VLLM_LOGGING_LEVEL=DEBUG ' : '';
+        const output = this.ssh(`${envPrefix}python3 vllm_manager.py start_raw "${modelId}" "${name}" "${base64Args}"`, false, false, podName);

         // Extract connection info from output
         const urlMatch = output.match(/URL: (http:\/\/[^\s]+)/);
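The warmup added above is just a one-token request against the model's OpenAI-compatible /chat/completions endpoint, so any HTTP client can talk to a model once pi start reports its URL. A minimal sketch of such a client call (the baseUrl value is an assumption; substitute the URL that pi prints after startup):

// Minimal chat request against a vLLM server started by pi.
const baseUrl = 'http://POD_HOST:8000/v1'; // assumption: replace with the URL printed by `pi start`

async function chat(modelId, userMessage) {
    const res = await fetch(`${baseUrl}/chat/completions`, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({
            model: modelId,
            messages: [{ role: 'user', content: userMessage }],
            max_tokens: 128,
            temperature: 0
        })
    });
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    const data = await res.json();
    return data.choices[0].message.content;
}

// chat('Qwen/Qwen2.5-7B-Instruct', 'Hi').then(console.log);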
@@ -426,6 +627,7 @@ class PrimeIntellectCLI {
                 console.log(`\nTo use with OpenAI clients:`);
                 console.log(exportCmd);
                 console.log(`export OPENAI_API_KEY='dummy'`);
+                console.log(`export OPENAI_MODEL='${modelId}'`);
             }
             console.log('='.repeat(60));
         };
@@ -436,47 +638,80 @@ class PrimeIntellectCLI {
             });

             // Watch logs until startup complete
-            await this.logs(name, true); // autoExit = true for startup
+            await this.logs(name, true, podName); // autoExit = true for startup
+
+            // Warm up the model with a simple prompt
+            console.log('\nWarming up model...');
+            try {
+                const warmupUrl = `${url}/chat/completions`;
+                const warmupPayload = {
+                    model: modelId,
+                    messages: [{ role: 'user', content: 'Hi' }],
+                    max_tokens: 1,
+                    temperature: 0
+                };
+
+                const warmupResponse = await fetch(warmupUrl, {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify(warmupPayload)
+                });

-
+                if (warmupResponse.ok) {
+                    console.log('✓ Model warmed up and ready!');
+                } else {
+                    console.log('⚠ Warmup failed, but model should still work');
+                }
+            } catch (e) {
+                console.log('⚠ Could not warm up model:', e.message);
+            }
+
+            // Show model info after warmup
             showModelInfo();
         } else {
             console.log(output);
         }
     }

-    stop(name) {
+    stop(name, podName = null) {
         if (!name) {
             // Stop all models
-            const runningModels = this.getRunningModels();
+            const runningModels = this.getRunningModels(podName);
             const modelNames = Object.keys(runningModels);

             if (modelNames.length === 0) {
                 console.log('No models running');
+                // Still clean up any hanging vLLM processes
+                console.log('Cleaning up any remaining vLLM processes...');
+                this.ssh("ps aux | grep -E 'python.*vllm' | grep -v grep | grep -v vllm_manager.py | awk '{print $2}' | xargs -r kill -9 2>/dev/null || true", false, false, podName);
                 return;
             }

             console.log(`Stopping ${modelNames.length} model(s): ${modelNames.join(', ')}`);

             for (const modelName of modelNames) {
-                const output = this.ssh(`python3 vllm_manager.py stop ${modelName}
+                const output = this.ssh(`python3 vllm_manager.py stop ${modelName}`, false, false, podName);
                 console.log(output);
             }
+
+            // Final cleanup of vLLM processes after stopping all models
+            console.log('Ensuring all vLLM processes are terminated...');
+            this.ssh("ps aux | grep -E 'python.*vllm' | grep -v grep | grep -v vllm_manager.py | awk '{print $2}' | xargs -r kill -9 2>/dev/null || true", false, false, podName);
         } else {
             // Stop specific model
-            const output = this.ssh(`python3 vllm_manager.py stop ${name}
+            const output = this.ssh(`python3 vllm_manager.py stop ${name}`, false, false, podName);
             console.log(output);
         }
     }

-    async logs(name, autoExit = false) {
+    async logs(name, autoExit = false, podName = null) {
         if (!name) {
             console.error('Usage: pi logs <name>');
             process.exit(1);
         }

         // Use vllm_manager.py to get the log file path
-        const infoOutput = this.ssh(`python3 vllm_manager.py list
+        const infoOutput = this.ssh(`python3 vllm_manager.py list`, false, false, podName);

         // Extract log file path from the output
         const lines = infoOutput.split('\n');
@@ -498,8 +733,10 @@ class PrimeIntellectCLI {
         }

         // Use a custom tail that watches for startup complete
-        const pod = this.getActivePod();
-
+        const pod = podName ? this.config.pods[podName] : this.getActivePod();
+        // Add SSH options to prevent connection issues
+        const sshOpts = '-o ServerAliveInterval=5 -o ServerAliveCountMax=3 -o TCPKeepAlive=yes';
+        const sshCmd = `ssh ${sshOpts} ${pod.ssh} tail -n 50 -f ${logFile}`;

         return new Promise((resolve) => {
             const [cmd, ...args] = sshCmd.split(' ');
@@ -538,14 +775,19 @@ class PrimeIntellectCLI {
         });
     }

-    async shell() {
-        const pod = this.getActivePod();
+    async shell(podName = null) {
+        const pod = podName ? this.config.pods[podName] : this.getActivePod();
         if (!pod) {
-
+            if (podName) {
+                console.error(`Pod '${podName}' not found`);
+                console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
+            } else {
+                console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
+            }
             process.exit(1);
         }

-        console.log(
+        console.log(`Connecting to pod${podName ? ` '${podName}'` : ''}...`);

         // Use spawn directly for interactive shell
         const sshParts = pod.ssh.split(' ');
@@ -663,13 +905,185 @@ class PrimeIntellectCLI {
         }
     }

-    async
+    async checkDownloads(podName = null, live = false) {
+        // Check only active pod or specified pod
+        const targetPod = podName || this.config.active;
+        if (!targetPod || !this.config.pods[targetPod]) {
+            console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
+            process.exit(1);
+        }
+
+        if (!live) {
+            // Single check mode
+            console.log(`Checking model downloads on pod: ${targetPod}\n`);
+            const output = this.ssh('python3 vllm_manager.py downloads', false, false, targetPod);
+
+            if (output.includes('No HuggingFace cache found') || output.includes('No models in cache')) {
+                console.log(output);
+                return;
+            }
+
+            // Parse and display
+            const downloadInfo = JSON.parse(output);
+            this._displayDownloadInfo(downloadInfo);
+        } else {
+            // Live streaming mode
+            const pod = this.config.pods[targetPod];
+            // Build SSH command with proper shell invocation
+            const sshParts = pod.ssh.split(' ');
+            const remoteCmd = 'source .pirc && python3 vllm_manager.py downloads --stream';
+
+            return new Promise((resolve) => {
+                const proc = spawn('ssh', [...sshParts, remoteCmd], { stdio: ['inherit', 'pipe', 'pipe'] });
+
+                let buffer = '';
+
+                // Handle Ctrl+C gracefully
+                process.on('SIGINT', () => {
+                    console.log('\n\nStopping download monitor...');
+                    proc.kill('SIGTERM'); // Send SIGTERM to remote process
+                    setTimeout(() => {
+                        proc.kill('SIGKILL'); // Force kill if not terminated
+                        process.exit(0);
+                    }, 1000);
+                });
+
+                // Print header once
+                console.log(`Monitoring model downloads on pod: ${targetPod} (Press Ctrl+C to stop)`);
+                console.log(); // Empty line after header
+
+                // Hide cursor
+                process.stdout.write('\x1B[?25l');
+
+                // Ensure cursor is shown again on exit
+                const cleanup = () => {
+                    process.stdout.write('\x1B[?25h');
+                };
+                process.on('exit', cleanup);
+                process.on('SIGINT', cleanup);
+
+                let previousLineCount = 0;
+
+                proc.stdout.on('data', (data) => {
+                    buffer += data.toString();
+
+                    // Process complete lines
+                    const lines = buffer.split('\n');
+                    buffer = lines[lines.length - 1]; // Keep incomplete line in buffer
+
+                    for (let i = 0; i < lines.length - 1; i++) {
+                        const line = lines[i].trim();
+                        if (line) {
+                            try {
+                                const downloadInfo = JSON.parse(line);
+
+                                // If we printed lines before, move cursor back up
+                                if (previousLineCount > 0) {
+                                    process.stdout.write(`\x1B[${previousLineCount}A`); // Move up N lines
+                                    process.stdout.write('\x1B[0J'); // Clear from cursor to end of screen
+                                }
+
+                                // Build all output as a single string
+                                let output = '';
+                                const addLine = (text = '') => {
+                                    output += text + '\n';
+                                };
+
+                                if (downloadInfo.status === 'NO_CACHE' || downloadInfo.status === 'NO_MODELS') {
+                                    addLine(downloadInfo.message);
+                                } else {
+                                    // Build the display output
+                                    for (const model of downloadInfo.models) {
+                                        addLine(`Model: ${model.model}`);
+                                        addLine(` Size: ${model.size_gb}GB`);
+
+                                        if (model.total_files > 0) {
+                                            const percentage = Math.round((model.files / model.total_files) * 100);
+                                            addLine(` Files: ${model.files}/${model.total_files} (${percentage}%)`);
+
+                                            // Show progress bar
+                                            const barLength = 30;
+                                            const filled = Math.round((percentage / 100) * barLength);
+                                            const empty = barLength - filled;
+                                            const progressBar = '█'.repeat(filled) + '░'.repeat(empty);
+                                            addLine(` Progress: [${progressBar}] ${percentage}%`);
+                                        } else {
+                                            addLine(` Files: ${model.files}`);
+                                        }
+
+                                        addLine(` Status: ${model.active ? '⏬ Downloading' : '⏸ Idle'}`);
+                                        addLine(); // Empty line between models
+                                    }
+
+                                    if (downloadInfo.vllm_processes > 0) {
+                                        addLine(`Active vLLM processes: ${downloadInfo.vllm_processes}`);
+                                    }
+
+                                    addLine();
+                                    addLine(`Last updated: ${new Date().toLocaleTimeString()}`);
+                                }
+
+                                // Write all output at once and count lines
+                                process.stdout.write(output);
+                                previousLineCount = (output.match(/\n/g) || []).length;
+
+                            } catch (e) {
+                                // Not JSON, just display as is
+                                console.log(line);
+                            }
+                        }
+                    }
+                });
+
+                proc.stderr.on('data', (data) => {
+                    process.stderr.write(data);
+                });
+
+                proc.on('close', () => {
+                    cleanup(); // Restore cursor
+                    resolve();
+                });
+            });
+        }
+    }
+
+    _displayDownloadInfo(downloadInfo) {
+        for (const model of downloadInfo.models) {
+            console.log(`\nModel: ${model.model}`);
+            console.log(` Size: ${model.size_gb}GB`);
+
+            if (model.total_files > 0) {
+                const percentage = Math.round((model.files / model.total_files) * 100);
+                console.log(` Files: ${model.files}/${model.total_files} (${percentage}%)`);
+
+                // Show progress bar
+                const barLength = 30;
+                const filled = Math.round((percentage / 100) * barLength);
+                const empty = barLength - filled;
+                const progressBar = '█'.repeat(filled) + '░'.repeat(empty);
+                console.log(` Progress: [${progressBar}] ${percentage}%`);
+            } else {
+                console.log(` Files: ${model.files}`);
+            }

+            console.log(` Status: ${model.active ? '⏬ Downloading' : '⏸ Idle'}`);
+        }
+
+        if (downloadInfo.vllm_processes > 0) {
+            console.log(`\nActive vLLM processes: ${downloadInfo.vllm_processes}`);
+        }
+
+        // Show timestamp
+        console.log(`\nLast updated: ${new Date().toLocaleTimeString()}`);
+    }
+
+    async prompt(name, message, podName = null) {
         // Get model info
-        const models = this.getRunningModels();
+        const models = this.getRunningModels(podName);
         const model = models[name];

         if (!model || !model.url) {
-            console.error(`Model '${name}' is not running`);
+            console.error(`Model '${name}' is not running${podName ? ` on pod '${podName}'` : ''}`);
             console.error('Running models:', Object.keys(models).join(', ') || 'none');
             process.exit(1);
         }
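The live pi downloads --live view above redraws in place with raw ANSI escape sequences rather than a TUI library: hide the cursor, then on each update move the cursor up by the number of lines previously written and clear to the end of the screen before printing the next frame. A reduced, self-contained sketch of that redraw pattern (the simulated progress counter is illustrative only):

// In-place terminal redraw using the same escape sequences as the download monitor.
function makeRedrawer() {
    let previousLineCount = 0;
    process.stdout.write('\x1B[?25l'); // hide cursor
    process.on('exit', () => process.stdout.write('\x1B[?25h')); // restore cursor on exit
    return (frame) => {
        if (previousLineCount > 0) {
            process.stdout.write(`\x1B[${previousLineCount}A`); // move up N lines
            process.stdout.write('\x1B[0J'); // clear from cursor to end of screen
        }
        process.stdout.write(frame);
        previousLineCount = (frame.match(/\n/g) || []).length;
    };
}

// Example: redraw a fake progress readout once per second.
const redraw = makeRedrawer();
let pct = 0;
const timer = setInterval(() => {
    redraw(`Progress: ${pct}%\nLast updated: ${new Date().toLocaleTimeString()}\n`);
    if ((pct += 10) > 100) { clearInterval(timer); process.exit(0); }
}, 1000);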
@@ -703,7 +1117,7 @@ class PrimeIntellectCLI {
     }

     showHelp() {
-        console.log('\
+        console.log('\npi CLI\n');

         console.log('Pod Management:');
         console.log(' pi setup <pod-name> <ssh_command> Configure and activate a pod');
@@ -711,20 +1125,24 @@ class PrimeIntellectCLI {
         console.log(' pi pod <pod-name> Switch active pod');
         console.log(' pi pod remove <pod-name> Remove pod from config\n');
         console.log('Model Management:');
-        console.log(' pi list
+        console.log(' pi list [--pod <pod-name>] List running models');
         console.log(' pi search <query> Search HuggingFace models');
         console.log(' pi start <model> [options] Start a model');
-        console.log(' pi stop [name]
-        console.log(' pi logs <name>
-        console.log(' pi prompt <name> <msg>
+        console.log(' pi stop [name] [--pod <pod-name>] Stop a model (or all if no name)');
+        console.log(' pi logs <name> [--pod <pod-name>] View model logs');
+        console.log(' pi prompt <name> <msg> [--pod <pod-name>] Chat with a model');
+        console.log(' pi downloads [--pod <pod-name>] [--live] Check model download progress (--live for continuous monitoring)\n');
         console.log('Start Options:');
         console.log(' --name <name> Model alias (default: auto-generated)');
         console.log(' --context <size> Context window: 4k, 8k, 16k, 32k, 64k, 128k (default: model default)');
         console.log(' --memory <percent> GPU memory: 30%, 50%, 90% (default: 90%)');
         console.log(' --all-gpus Use all GPUs with tensor parallelism');
+        console.log(' --pod <pod-name> Run on specific pod without switching active pod');
+        console.log(' --debug Enable debug logging for vLLM');
         console.log(' --vllm-args Pass remaining args directly to vLLM\n');
         console.log('Utility:');
-        console.log(' pi shell
+        console.log(' pi shell [--pod <pod-name>] SSH into pod');
+        console.log(' pi ssh [--pod <pod-name>] <cmd> Run SSH command on pod');

         console.log('\nQuick Examples:');
         console.log(' pi start Qwen/Qwen2.5-7B-Instruct --name qwen');
@@ -732,7 +1150,7 @@ class PrimeIntellectCLI {
         console.log('\n # Qwen3-Coder on 8xH200 with custom vLLM args:');
         console.log(' pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-coder --vllm-args \\');
         console.log(' --data-parallel-size 8 --enable-expert-parallel \\');
-        console.log(' --tool-call-parser qwen3_coder --enable-auto-tool-choice --max-model-len 200000');
+        console.log(' --tool-call-parser qwen3_coder --enable-auto-tool-choice --gpu-memory-utilization 0.95 --max-model-len 200000');

         if (this.config.active && this.config.pods[this.config.active]) {
             console.log(`\nActive pod: ${this.config.active} (${this.config.pods[this.config.active].ssh})`);
@@ -741,9 +1159,9 @@ class PrimeIntellectCLI {
         }
     }

-    getRunningModels() {
+    getRunningModels(podName = null) {
         try {
-            const output = this.ssh('python3 vllm_manager.py list');
+            const output = this.ssh('python3 vllm_manager.py list', false, false, podName);
             const models = {};

             // Parse the output to extract model info
@@ -817,9 +1235,18 @@ class PrimeIntellectCLI {
                 break;

             case 'list':
-            case 'ls':
-
+            case 'ls': {
+                let podName = null;
+
+                // Parse --pod parameter
+                const podIndex = args.indexOf('--pod');
+                if (podIndex !== -1 && args[podIndex + 1]) {
+                    podName = args[podIndex + 1];
+                }
+
+                this.list(podName);
                 break;
+            }

             case 'search':
                 if (!args[0]) {
@@ -830,42 +1257,116 @@ class PrimeIntellectCLI {
                 await this.searchModels(args[0]);
                 break;

+            case 'downloads': {
+                let podName = null;
+                let live = false;
+
+                // Parse --pod parameter
+                const podIndex = args.indexOf('--pod');
+                if (podIndex !== -1 && args[podIndex + 1]) {
+                    podName = args[podIndex + 1];
+                }
+
+                // Parse --live parameter
+                if (args.includes('--live')) {
+                    live = true;
+                }
+
+                await this.checkDownloads(podName, live);
+                break;
+            }
+
             case 'start':
                 await this.handleStart(args);
                 break;

-            case 'stop':
-
+            case 'stop': {
+                let modelName = args[0];
+                let podName = null;
+
+                // Parse --pod parameter
+                const podIndex = args.indexOf('--pod');
+                if (podIndex !== -1 && args[podIndex + 1]) {
+                    podName = args[podIndex + 1];
+                    // Remove --pod and its value from args
+                    args.splice(podIndex, 2);
+                    modelName = args[0]; // Update modelName after removing --pod
+                }
+
+                this.stop(modelName, podName);
                 break;
+            }
+
+            case 'logs': {
+                let modelName = args[0];
+                let podName = null;
+
+                // Parse --pod parameter
+                const podIndex = args.indexOf('--pod');
+                if (podIndex !== -1 && args[podIndex + 1]) {
+                    podName = args[podIndex + 1];
+                    // Remove --pod and its value from args
+                    args.splice(podIndex, 2);
+                    modelName = args[0]; // Update modelName after removing --pod
+                }

-
-                await this.logs(args[0], false); // autoExit = false for manual logs command
+                await this.logs(modelName, false, podName); // autoExit = false for manual logs command
                 break;
+            }

             case 'prompt': {
                 if (args.length < 2) {
-                    console.error('Usage: pi prompt <model_name> "<message>"');
+                    console.error('Usage: pi prompt <model_name> "<message>" [--pod <pod-name>]');
                     console.error('Example: pi prompt phi3 "Hey, how you going"');
                     process.exit(1);
                 }
-
+                let modelName = args[0];
+                let podName = null;
+
+                // Parse --pod parameter
+                const podIndex = args.indexOf('--pod');
+                if (podIndex !== -1 && args[podIndex + 1]) {
+                    podName = args[podIndex + 1];
+                    // Remove --pod and its value from args
+                    args.splice(podIndex, 2);
+                }
+
                 const message = args.slice(1).join(' ');
-                this.prompt(modelName, message);
+                this.prompt(modelName, message, podName);
                 break;
             }
-            case 'shell':
-
+            case 'shell': {
+                let podName = null;
+
+                // Parse --pod parameter
+                const podIndex = args.indexOf('--pod');
+                if (podIndex !== -1 && args[podIndex + 1]) {
+                    podName = args[podIndex + 1];
+                }
+
+                await this.shell(podName);
                 break;
+            }
+
+            case 'ssh': {
+                let podName = null;
+                let sshArgs = [...args];
+
+                // For ssh, --pod must be the first parameter if present
+                if (args[0] === '--pod' && args[1]) {
+                    podName = args[1];
+                    sshArgs = args.slice(2); // Remove --pod and podName from args
+                }

-            case 'ssh':
                 // Pass through any SSH command
-                if (
-                const output = this.ssh(
+                if (sshArgs.length > 0) {
+                    const output = this.ssh(sshArgs.join(' '), false, false, podName);
                     console.log(output);
                 } else {
-                    this.shell();
+                    await this.shell(podName);
                 }
                 break;
+            }

             default:
                 this.showHelp();
@@ -874,5 +1375,5 @@ class PrimeIntellectCLI {
 }

 // Run CLI
-const cli = new
+const cli = new PiCli();
 cli.run().catch(console.error);