@mariozechner/pi 0.1.5 → 0.2.4

package/pi.js CHANGED
@@ -1,6 +1,6 @@
  #!/usr/bin/env node
  /**
- * Prime Intellect CLI - All-in-one pod management
+ * pi CLI
  */

  const fs = require('fs');
@@ -11,7 +11,7 @@ const os = require('os');
  const CONFIG_FILE = path.join(os.homedir(), '.pi_config');
  const SCRIPT_DIR = __dirname;

- class PrimeIntellectCLI {
+ class PiCli {
  constructor() {
  this.loadConfig();
  }
@@ -43,12 +43,17 @@ class PrimeIntellectCLI {
  return this.config.pods[this.config.active];
  }

- ssh(command, interactive = false, skipPirc = false) {
- const pod = this.getActivePod();
+ ssh(command, interactive = false, skipPirc = false, podName = null) {
+ const pod = podName ? this.config.pods[podName] : this.getActivePod();
  if (!pod) {
- console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
- console.error('Example: pi setup prod "root@135.181.71.41 -p 22"');
- console.error('Or activate an existing pod: pi pod <pod-name>');
+ if (podName) {
+ console.error(`Pod '${podName}' not found`);
+ console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
+ } else {
+ console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
+ console.error('Example: pi setup prod "root@135.181.71.41 -p 22"');
+ console.error('Or activate an existing pod: pi pod <pod-name>');
+ }
  process.exit(1);
  }

@@ -79,10 +84,14 @@ class PrimeIntellectCLI {
  }
  }

- scp(localFile, remotePath = '~/') {
- const pod = this.getActivePod();
+ scp(localFile, remotePath = '~/', podName = null) {
+ const pod = podName ? this.config.pods[podName] : this.getActivePod();
  if (!pod) {
- console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
+ if (podName) {
+ console.error(`Pod '${podName}' not found`);
+ } else {
+ console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
+ }
  process.exit(1);
  }

@@ -159,8 +168,8 @@ class PrimeIntellectCLI {
  this.showHelp();
  }

- list() {
- const output = this.ssh('python3 vllm_manager.py list');
+ list(podName = null) {
+ const output = this.ssh('python3 vllm_manager.py list', false, false, podName);
  console.log(output);
  }

@@ -207,6 +216,8 @@ class PrimeIntellectCLI {
  console.error(' --context <size> Context window: 4k, 8k, 16k, 32k, 64k, 128k or 4096, 8192, etc (default: model default)');
  console.error(' --memory <percent> GPU memory: 30%, 50%, 90% or 0.3, 0.5, 0.9 (default: 90%)');
  console.error(' --all-gpus Use all GPUs with tensor parallelism (ignores --memory)');
+ console.error(' --debug Enable debug logging for vLLM');
+ console.error(' --pod <name> Run on specific pod (default: active pod)');
  console.error(' --vllm-args Pass remaining args directly to vLLM (ignores other options)');
  console.error('');
  console.error('Examples:');
@@ -216,9 +227,9 @@ class PrimeIntellectCLI {
  console.error(' pi start meta-llama/Llama-3.1-405B --all-gpus --context 128k');
  console.error('');
  console.error(' # Custom vLLM args for Qwen3-Coder on 8xH200:');
- console.error(' pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-coder --vllm-args \\');
- console.error(' --data-parallel-size 8 --enable-expert-parallel \\');
- console.error(' --tool-call-parser qwen3_coder --enable-auto-tool-choice --gpu-memory-utilization 0.9 --max-model-len 200000');
+ console.error(' pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-coder --vllm-args \\\\');
+ console.error(' --data-parallel-size 8 --enable-expert-parallel \\\\');
+ console.error(' --tool-call-parser qwen3_coder --enable-auto-tool-choice --gpu-memory-utilization 0.95 --max-model-len 200000');
  process.exit(1);
  }

@@ -227,15 +238,21 @@ class PrimeIntellectCLI {
  let context = null; // Changed to null - let vLLM use model default
  let memory = 0.9;
  let allGpus = false;
+ let debug = false;
  let vllmArgs = null;
+ let podName = null;

  // Check for --vllm-args first
  const vllmArgsIndex = args.indexOf('--vllm-args');
  if (vllmArgsIndex !== -1) {
- // Extract name if provided before --vllm-args
+ // Extract name and pod if provided before --vllm-args
  for (let i = 1; i < vllmArgsIndex; i++) {
  if (args[i] === '--name' && args[i + 1]) {
  name = args[++i];
+ } else if (args[i] === '--pod' && args[i + 1]) {
+ podName = args[++i];
+ } else if (args[i] === '--debug') {
+ debug = true;
  }
  }
  // Everything after --vllm-args is passed to vLLM
@@ -256,6 +273,12 @@ class PrimeIntellectCLI {
  case '--all-gpus':
  allGpus = true;
  break;
+ case '--debug':
+ debug = true;
+ break;
+ case '--pod':
+ podName = args[++i];
+ break;
  default:
  console.error(`Unknown option: ${args[i]}`);
  process.exit(1);
@@ -264,7 +287,7 @@ class PrimeIntellectCLI {
  }

  // Check for multi-GPU setup
- const gpuCount = await this.getGpuCount();
+ const gpuCount = await this.getGpuCount(podName);

  if (allGpus) {
  if (memory !== 0.9) {
@@ -292,29 +315,170 @@ class PrimeIntellectCLI {
  .slice(0, 20);
  }

- // If vllmArgs provided, use raw vLLM command
+ // If vllmArgs provided, skip memory check since we don't know the parallelism strategy
  if (vllmArgs) {
- await this.startRaw(modelId, name, vllmArgs);
- } else {
- // Call the original start method with positional args
- const contextStr = context ? context.toString() : null;
- await this.start(modelId, name, contextStr, memory.toString(), { allGpus, gpuCount });
+ const modelEstimate = await this.getModelMemoryEstimate(modelId, context);
+ if (modelEstimate) {
+ console.log(`Model weights: ${modelEstimate.modelSizeGB.toFixed(1)}GB`);
+ console.log(`Context length: ${modelEstimate.contextLength.toLocaleString()} tokens`);
+ }
+ console.log(`Target pod: ${podName || this.config.active || 'active pod'}`);
+ await this.startRaw(modelId, name, vllmArgs, debug, podName);
+ return;
+ }
+
+ // For standard deployment, check memory
+ const modelEstimate = await this.getModelMemoryEstimate(modelId, context);
+
+ // Check GPU memory before starting
+ console.log('Checking model size and GPU memory...');
+ console.log(`Target pod: ${podName || this.config.active || 'active pod'}`);
+ const [memoryInfo, modelEstimateWithContext] = await Promise.all([
+ this.getGpuMemoryInfo(podName),
+ modelEstimate
+ ]);
+
+ if (memoryInfo && modelEstimateWithContext) {
+ // For tensor parallel (--all-gpus), memory is distributed across GPUs
+ const effectiveMemoryNeeded = allGpus && gpuCount > 1
+ ? modelEstimateWithContext.estimatedMemoryGB / gpuCount
+ : modelEstimateWithContext.estimatedMemoryGB;
+
+ const memoryPerGpu = memoryInfo.freeMemoryGB / (gpuCount || 1);
+
+ console.log(`Model weights: ${modelEstimateWithContext.modelSizeGB.toFixed(1)}GB`);
+ console.log(`Context length: ${modelEstimateWithContext.contextLength.toLocaleString()} tokens`);
+ console.log(`Note: Estimate includes model parameters only, not KV cache for context`);
+ console.log(`Available GPU memory: ${memoryInfo.freeMemoryGB.toFixed(1)}GB total (${memoryPerGpu.toFixed(1)}GB per GPU)`);
+
+ if (effectiveMemoryNeeded > memoryPerGpu) {
+ // Log a BIG WARNING as requested
+ console.error(`\n❌ BIG WARNING: Insufficient GPU memory`);
+ if (allGpus && gpuCount > 1) {
+ console.error(` Model needs ~${effectiveMemoryNeeded.toFixed(1)}GB per GPU but only ${memoryPerGpu.toFixed(1)}GB available`);
+ } else {
+ console.error(` Model needs ~${modelEstimateWithContext.estimatedMemoryGB.toFixed(1)}GB but only ${memoryInfo.freeMemoryGB.toFixed(1)}GB available`);
+ }
+ console.error('\n Free up memory by stopping running models:');
+ console.error(' pi list # See running models');
+ console.error(' pi stop <model_name> # Stop specific model');
+ console.error(' pi stop # Stop all models\n');
+ // Don't exit, just warn and proceed
+ }
  }
+
+ // Call the original start method with positional args
+ const contextStr = context ? context.toString() : null;
+ await this.start(modelId, name, contextStr, memory.toString(), { allGpus, gpuCount, debug, podName });
  }

- async getGpuCount() {
+ async getGpuCount(podName = null) {
  try {
- const output = this.ssh('nvidia-smi --query-gpu=name --format=csv,noheader | wc -l');
+ const output = this.ssh('nvidia-smi --query-gpu=name --format=csv,noheader | wc -l', false, false, podName);
  return parseInt(output.trim()) || 1;
  } catch {
  return 1;
  }
  }

+ async getGpuMemoryInfo(podName = null) {
+ try {
+ const output = this.ssh('nvidia-smi --query-gpu=memory.total,memory.free --format=csv,noheader,nounits', false, false, podName);
+ const lines = output.trim().split('\n');
+ let totalMemoryGB = 0;
+ let freeMemoryGB = 0;
+
+ for (const line of lines) {
+ const [total, free] = line.split(',').map(x => parseInt(x.trim()));
+ totalMemoryGB += total / 1024;
+ freeMemoryGB += free / 1024;
+ }
+
+ return { totalMemoryGB, freeMemoryGB };
+ } catch (e) {
+ return null;
+ }
+ }
+
+ async getModelMemoryEstimate(modelId, contextLength = null) {
+ try {
+ const response = await fetch(`https://huggingface.co/api/models/${modelId}`);
+ const data = await response.json();
+
+ if (data.safetensors?.parameters) {
+ // Calculate actual model size based on parameter counts and types
+ const dtypeSizes = {
+ 'F64': 8, // float64 - 8 bytes
+ 'F32': 4, // float32 - 4 bytes
+ 'BF16': 2, // bfloat16 - 2 bytes
+ 'F16': 2, // float16 - 2 bytes
+ 'I32': 4, // int32 - 4 bytes
+ 'I16': 2, // int16 - 2 bytes
+ 'I8': 1, // int8 - 1 byte
+ 'U8': 1, // uint8 - 1 byte
+ 'I4': 0.5, // int4 - 0.5 bytes (packed)
+ 'F8_E4M3': 1, // FP8 E4M3 format - 1 byte
+ 'F8_E5M2': 1, // FP8 E5M2 format - 1 byte
+ 'Q8_0': 1, // GGML quantization formats
+ 'Q4_0': 0.5, // GGML quantization formats
+ 'Q4_1': 0.5, // GGML quantization formats
+ 'Q5_0': 0.625, // GGML quantization formats
+ 'Q5_1': 0.625 // GGML quantization formats
+ };
+
+ let totalBytes = 0;
+ let paramDetails = [];
+
+ // Calculate bytes for each dtype
+ let unknownDtypes = [];
+ for (const [dtype, paramCount] of Object.entries(data.safetensors.parameters)) {
+ let bytesPerParam = dtypeSizes[dtype];
+ if (bytesPerParam === undefined) {
+ // Unknown dtype - assume 1 byte (most new formats are quantized)
+ bytesPerParam = 1; // Conservative for memory checking
+ unknownDtypes.push(dtype);
+ }
+ const bytes = paramCount * bytesPerParam;
+ totalBytes += bytes;
+ paramDetails.push({ dtype, count: paramCount, bytes });
+ }
+
+ if (unknownDtypes.length > 0) {
+ console.warn(`Unknown dtype(s) found: ${unknownDtypes.join(', ')}. Assuming 1 byte per parameter.`);
+ }
+
+ const modelSizeGB = totalBytes / (1024 ** 3);
+
+ // Try to get model config for context length
+ let maxContextLength = contextLength;
+ try {
+ const configResponse = await fetch(`https://huggingface.co/${modelId}/raw/main/config.json`);
+ if (configResponse.ok) {
+ const config = await configResponse.json();
+ maxContextLength = contextLength || config.max_position_embeddings || 8192;
+ }
+ } catch (e) {
+ maxContextLength = contextLength || 8192;
+ }
+
+ return {
+ modelSizeGB,
+ estimatedMemoryGB: modelSizeGB, // Only model weights, not KV cache
+ contextLength: maxContextLength,
+ paramDetails // For debugging
+ };
+ }
+
+ return null;
+ } catch (e) {
+ return null;
+ }
+ }
+
  async start(modelId, name, maxLen = null, gpuMemory, options = {}) {
  // Check if name is already in use locally first
  if (name) {
- const runningModels = this.getRunningModels();
+ const runningModels = this.getRunningModels(options.podName);
  if (runningModels[name]) {
  console.error(`Error: Model name '${name}' is already in use`);
  console.error('Running models:', Object.keys(runningModels).join(', '));
@@ -322,6 +486,8 @@ class PrimeIntellectCLI {
  }
  }

+ // Memory check is already done in handleStart, skip it here
+
  // Build args for vllm_manager.py
  let args = modelId;

@@ -344,7 +510,12 @@ class PrimeIntellectCLI {
  args += ` ${options.gpuCount}`; // Pass tensor parallel size
  }

- const output = this.ssh(`${envPrefix}python3 vllm_manager.py start ${args}`);
+ // Add debug logging if requested
+ if (options.debug) {
+ envPrefix = 'VLLM_LOGGING_LEVEL=DEBUG ';
+ }
+
+ const output = this.ssh(`${envPrefix}python3 vllm_manager.py start ${args}`, false, false, options.podName);

  // Extract model name and connection info from output
  const nameMatch = output.match(/Started (\S+)/);
@@ -370,6 +541,7 @@ class PrimeIntellectCLI {
  console.log(`\nTo use with OpenAI clients:`);
  console.log(exportCmd);
  console.log(`export OPENAI_API_KEY='dummy'`);
+ console.log(`export OPENAI_MODEL='${modelId}'`);
  }
  console.log('='.repeat(60));
  };
@@ -380,29 +552,58 @@ class PrimeIntellectCLI {
  });

  // Watch logs until startup complete
- await this.logs(modelName, true); // autoExit = true for startup
+ await this.logs(modelName, true, options.podName); // autoExit = true for startup

- // Show model info after automatic exit
+ // Warm up the model with a simple prompt
+ console.log('\nWarming up model...');
+ try {
+ const warmupUrl = `${url}/chat/completions`;
+ const warmupPayload = {
+ model: modelId,
+ messages: [{ role: 'user', content: 'Hi' }],
+ max_tokens: 1,
+ temperature: 0
+ };
+
+ const warmupResponse = await fetch(warmupUrl, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify(warmupPayload)
+ });
+
+ if (warmupResponse.ok) {
+ console.log('✓ Model warmed up and ready!');
+ } else {
+ console.log('⚠ Warmup failed, but model should still work');
+ }
+ } catch (e) {
+ console.log('⚠ Could not warm up model:', e.message);
+ }
+
+ // Show model info after warmup
  showModelInfo();
  } else {
  console.log(output);
  }
  }

- async startRaw(modelId, name, vllmArgs) {
+ async startRaw(modelId, name, vllmArgs, debug = false, podName = null) {
+ // Skip memory check for raw vLLM args since we don't know what custom settings are used
+ console.log('Note: Memory checking disabled when using --vllm-args');
  // Check if name is already in use
- const runningModels = this.getRunningModels();
+ const runningModels = this.getRunningModels(podName);
  if (runningModels[name]) {
  console.error(`Error: Model name '${name}' is already in use`);
  console.error('Running models:', Object.keys(runningModels).join(', '));
  process.exit(1);
  }

- console.log(`Starting ${name} with custom vLLM args...`);
+ console.log(`Starting ${name} with custom vLLM args on pod: ${podName || this.config.active || 'active pod'}`);

  // Start vLLM with raw arguments - use base64 to safely pass complex args
  const base64Args = Buffer.from(vllmArgs).toString('base64');
- const output = this.ssh(`python3 vllm_manager.py start_raw "${modelId}" "${name}" "${base64Args}"`);
+ const envPrefix = debug ? 'VLLM_LOGGING_LEVEL=DEBUG ' : '';
+ const output = this.ssh(`${envPrefix}python3 vllm_manager.py start_raw "${modelId}" "${name}" "${base64Args}"`, false, false, podName);

  // Extract connection info from output
  const urlMatch = output.match(/URL: (http:\/\/[^\s]+)/);
@@ -426,6 +627,7 @@ class PrimeIntellectCLI {
  console.log(`\nTo use with OpenAI clients:`);
  console.log(exportCmd);
  console.log(`export OPENAI_API_KEY='dummy'`);
+ console.log(`export OPENAI_MODEL='${modelId}'`);
  }
  console.log('='.repeat(60));
  };
@@ -436,47 +638,80 @@ class PrimeIntellectCLI {
  });

  // Watch logs until startup complete
- await this.logs(name, true); // autoExit = true for startup
+ await this.logs(name, true, podName); // autoExit = true for startup
+
+ // Warm up the model with a simple prompt
+ console.log('\nWarming up model...');
+ try {
+ const warmupUrl = `${url}/chat/completions`;
+ const warmupPayload = {
+ model: modelId,
+ messages: [{ role: 'user', content: 'Hi' }],
+ max_tokens: 1,
+ temperature: 0
+ };
+
+ const warmupResponse = await fetch(warmupUrl, {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify(warmupPayload)
+ });

- // Show model info after automatic exit
+ if (warmupResponse.ok) {
+ console.log('✓ Model warmed up and ready!');
+ } else {
+ console.log('⚠ Warmup failed, but model should still work');
+ }
+ } catch (e) {
+ console.log('⚠ Could not warm up model:', e.message);
+ }
+
+ // Show model info after warmup
  showModelInfo();
  } else {
  console.log(output);
  }
  }

- stop(name) {
+ stop(name, podName = null) {
  if (!name) {
  // Stop all models
- const runningModels = this.getRunningModels();
+ const runningModels = this.getRunningModels(podName);
  const modelNames = Object.keys(runningModels);

  if (modelNames.length === 0) {
  console.log('No models running');
+ // Still clean up any hanging vLLM processes
+ console.log('Cleaning up any remaining vLLM processes...');
+ this.ssh("ps aux | grep -E 'python.*vllm' | grep -v grep | grep -v vllm_manager.py | awk '{print $2}' | xargs -r kill -9 2>/dev/null || true", false, false, podName);
  return;
  }

  console.log(`Stopping ${modelNames.length} model(s): ${modelNames.join(', ')}`);

  for (const modelName of modelNames) {
- const output = this.ssh(`python3 vllm_manager.py stop ${modelName}`);
+ const output = this.ssh(`python3 vllm_manager.py stop ${modelName}`, false, false, podName);
  console.log(output);
  }
+
+ // Final cleanup of vLLM processes after stopping all models
+ console.log('Ensuring all vLLM processes are terminated...');
+ this.ssh("ps aux | grep -E 'python.*vllm' | grep -v grep | grep -v vllm_manager.py | awk '{print $2}' | xargs -r kill -9 2>/dev/null || true", false, false, podName);
  } else {
  // Stop specific model
- const output = this.ssh(`python3 vllm_manager.py stop ${name}`);
+ const output = this.ssh(`python3 vllm_manager.py stop ${name}`, false, false, podName);
  console.log(output);
  }
  }

- async logs(name, autoExit = false) {
+ async logs(name, autoExit = false, podName = null) {
  if (!name) {
  console.error('Usage: pi logs <name>');
  process.exit(1);
  }

  // Use vllm_manager.py to get the log file path
- const infoOutput = this.ssh(`python3 vllm_manager.py list`);
+ const infoOutput = this.ssh(`python3 vllm_manager.py list`, false, false, podName);

  // Extract log file path from the output
  const lines = infoOutput.split('\n');
@@ -498,8 +733,10 @@ class PrimeIntellectCLI {
  }

  // Use a custom tail that watches for startup complete
- const pod = this.getActivePod();
- const sshCmd = `ssh ${pod.ssh} tail -n 50 -f ${logFile}`;
+ const pod = podName ? this.config.pods[podName] : this.getActivePod();
+ // Add SSH options to prevent connection issues
+ const sshOpts = '-o ServerAliveInterval=5 -o ServerAliveCountMax=3 -o TCPKeepAlive=yes';
+ const sshCmd = `ssh ${sshOpts} ${pod.ssh} tail -n 50 -f ${logFile}`;

  return new Promise((resolve) => {
  const [cmd, ...args] = sshCmd.split(' ');
@@ -538,14 +775,19 @@ class PrimeIntellectCLI {
  });
  }

- async shell() {
- const pod = this.getActivePod();
+ async shell(podName = null) {
+ const pod = podName ? this.config.pods[podName] : this.getActivePod();
  if (!pod) {
- console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
+ if (podName) {
+ console.error(`Pod '${podName}' not found`);
+ console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
+ } else {
+ console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
+ }
  process.exit(1);
  }

- console.log('Connecting to pod...');
+ console.log(`Connecting to pod${podName ? ` '${podName}'` : ''}...`);

  // Use spawn directly for interactive shell
  const sshParts = pod.ssh.split(' ');
@@ -663,13 +905,185 @@ class PrimeIntellectCLI {
  }
  }

- async prompt(name, message) {
+ async checkDownloads(podName = null, live = false) {
+ // Check only active pod or specified pod
+ const targetPod = podName || this.config.active;
+ if (!targetPod || !this.config.pods[targetPod]) {
+ console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
+ process.exit(1);
+ }
+
+ if (!live) {
+ // Single check mode
+ console.log(`Checking model downloads on pod: ${targetPod}\n`);
+ const output = this.ssh('python3 vllm_manager.py downloads', false, false, targetPod);
+
+ if (output.includes('No HuggingFace cache found') || output.includes('No models in cache')) {
+ console.log(output);
+ return;
+ }
+
+ // Parse and display
+ const downloadInfo = JSON.parse(output);
+ this._displayDownloadInfo(downloadInfo);
+ } else {
+ // Live streaming mode
+ const pod = this.config.pods[targetPod];
+ // Build SSH command with proper shell invocation
+ const sshParts = pod.ssh.split(' ');
+ const remoteCmd = 'source .pirc && python3 vllm_manager.py downloads --stream';
+
+ return new Promise((resolve) => {
+ const proc = spawn('ssh', [...sshParts, remoteCmd], { stdio: ['inherit', 'pipe', 'pipe'] });
+
+ let buffer = '';
+
+ // Handle Ctrl+C gracefully
+ process.on('SIGINT', () => {
+ console.log('\n\nStopping download monitor...');
+ proc.kill('SIGTERM'); // Send SIGTERM to remote process
+ setTimeout(() => {
+ proc.kill('SIGKILL'); // Force kill if not terminated
+ process.exit(0);
+ }, 1000);
+ });
+
+ // Print header once
+ console.log(`Monitoring model downloads on pod: ${targetPod} (Press Ctrl+C to stop)`);
+ console.log(); // Empty line after header
+
+ // Hide cursor
+ process.stdout.write('\x1B[?25l');
+
+ // Ensure cursor is shown again on exit
+ const cleanup = () => {
+ process.stdout.write('\x1B[?25h');
+ };
+ process.on('exit', cleanup);
+ process.on('SIGINT', cleanup);
+
+ let previousLineCount = 0;
+
+ proc.stdout.on('data', (data) => {
+ buffer += data.toString();
+
+ // Process complete lines
+ const lines = buffer.split('\n');
+ buffer = lines[lines.length - 1]; // Keep incomplete line in buffer
+
+ for (let i = 0; i < lines.length - 1; i++) {
+ const line = lines[i].trim();
+ if (line) {
+ try {
+ const downloadInfo = JSON.parse(line);
+
+ // If we printed lines before, move cursor back up
+ if (previousLineCount > 0) {
+ process.stdout.write(`\x1B[${previousLineCount}A`); // Move up N lines
+ process.stdout.write('\x1B[0J'); // Clear from cursor to end of screen
+ }
+
+ // Build all output as a single string
+ let output = '';
+ const addLine = (text = '') => {
+ output += text + '\n';
+ };
+
+ if (downloadInfo.status === 'NO_CACHE' || downloadInfo.status === 'NO_MODELS') {
+ addLine(downloadInfo.message);
+ } else {
+ // Build the display output
+ for (const model of downloadInfo.models) {
+ addLine(`Model: ${model.model}`);
+ addLine(` Size: ${model.size_gb}GB`);
+
+ if (model.total_files > 0) {
+ const percentage = Math.round((model.files / model.total_files) * 100);
+ addLine(` Files: ${model.files}/${model.total_files} (${percentage}%)`);
+
+ // Show progress bar
+ const barLength = 30;
+ const filled = Math.round((percentage / 100) * barLength);
+ const empty = barLength - filled;
+ const progressBar = '█'.repeat(filled) + '░'.repeat(empty);
+ addLine(` Progress: [${progressBar}] ${percentage}%`);
+ } else {
+ addLine(` Files: ${model.files}`);
+ }
+
+ addLine(` Status: ${model.active ? '⏬ Downloading' : '⏸ Idle'}`);
+ addLine(); // Empty line between models
+ }
+
+ if (downloadInfo.vllm_processes > 0) {
+ addLine(`Active vLLM processes: ${downloadInfo.vllm_processes}`);
+ }
+
+ addLine();
+ addLine(`Last updated: ${new Date().toLocaleTimeString()}`);
+ }
+
+ // Write all output at once and count lines
+ process.stdout.write(output);
+ previousLineCount = (output.match(/\n/g) || []).length;
+
+ } catch (e) {
+ // Not JSON, just display as is
+ console.log(line);
+ }
+ }
+ }
+ });
+
+ proc.stderr.on('data', (data) => {
+ process.stderr.write(data);
+ });
+
+ proc.on('close', () => {
+ cleanup(); // Restore cursor
+ resolve();
+ });
+ });
+ }
+ }
+
+ _displayDownloadInfo(downloadInfo) {
+ for (const model of downloadInfo.models) {
+ console.log(`\nModel: ${model.model}`);
+ console.log(` Size: ${model.size_gb}GB`);
+
+ if (model.total_files > 0) {
+ const percentage = Math.round((model.files / model.total_files) * 100);
+ console.log(` Files: ${model.files}/${model.total_files} (${percentage}%)`);
+
+ // Show progress bar
+ const barLength = 30;
+ const filled = Math.round((percentage / 100) * barLength);
+ const empty = barLength - filled;
+ const progressBar = '█'.repeat(filled) + '░'.repeat(empty);
+ console.log(` Progress: [${progressBar}] ${percentage}%`);
+ } else {
+ console.log(` Files: ${model.files}`);
+ }
+
+ console.log(` Status: ${model.active ? '⏬ Downloading' : '⏸ Idle'}`);
+ }
+
+ if (downloadInfo.vllm_processes > 0) {
+ console.log(`\nActive vLLM processes: ${downloadInfo.vllm_processes}`);
+ }
+
+ // Show timestamp
+ console.log(`\nLast updated: ${new Date().toLocaleTimeString()}`);
+ }
+
+ async prompt(name, message, podName = null) {
  // Get model info
- const models = this.getRunningModels();
+ const models = this.getRunningModels(podName);
  const model = models[name];

  if (!model || !model.url) {
- console.error(`Model '${name}' is not running`);
+ console.error(`Model '${name}' is not running${podName ? ` on pod '${podName}'` : ''}`);
  console.error('Running models:', Object.keys(models).join(', ') || 'none');
  process.exit(1);
  }
@@ -703,7 +1117,7 @@ class PrimeIntellectCLI {
  }

  showHelp() {
- console.log('\nPrime Intellect CLI\n');
+ console.log('\npi CLI\n');

  console.log('Pod Management:');
  console.log(' pi setup <pod-name> <ssh_command> Configure and activate a pod');
@@ -711,20 +1125,24 @@ class PrimeIntellectCLI {
  console.log(' pi pod <pod-name> Switch active pod');
  console.log(' pi pod remove <pod-name> Remove pod from config\n');
  console.log('Model Management:');
- console.log(' pi list List running models');
+ console.log(' pi list [--pod <pod-name>] List running models');
  console.log(' pi search <query> Search HuggingFace models');
  console.log(' pi start <model> [options] Start a model');
- console.log(' pi stop [name] Stop a model (or all if no name)');
- console.log(' pi logs <name> View model logs');
- console.log(' pi prompt <name> <msg> Chat with a model\n');
+ console.log(' pi stop [name] [--pod <pod-name>] Stop a model (or all if no name)');
+ console.log(' pi logs <name> [--pod <pod-name>] View model logs');
+ console.log(' pi prompt <name> <msg> [--pod <pod-name>] Chat with a model');
+ console.log(' pi downloads [--pod <pod-name>] [--live] Check model download progress (--live for continuous monitoring)\n');
  console.log('Start Options:');
  console.log(' --name <name> Model alias (default: auto-generated)');
  console.log(' --context <size> Context window: 4k, 8k, 16k, 32k, 64k, 128k (default: model default)');
  console.log(' --memory <percent> GPU memory: 30%, 50%, 90% (default: 90%)');
  console.log(' --all-gpus Use all GPUs with tensor parallelism');
+ console.log(' --pod <pod-name> Run on specific pod without switching active pod');
+ console.log(' --debug Enable debug logging for vLLM');
  console.log(' --vllm-args Pass remaining args directly to vLLM\n');
  console.log('Utility:');
- console.log(' pi shell SSH into active pod');
+ console.log(' pi shell [--pod <pod-name>] SSH into pod');
+ console.log(' pi ssh [--pod <pod-name>] <cmd> Run SSH command on pod');

  console.log('\nQuick Examples:');
  console.log(' pi start Qwen/Qwen2.5-7B-Instruct --name qwen');
@@ -732,7 +1150,7 @@ class PrimeIntellectCLI {
  console.log('\n # Qwen3-Coder on 8xH200 with custom vLLM args:');
  console.log(' pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-coder --vllm-args \\');
  console.log(' --data-parallel-size 8 --enable-expert-parallel \\');
- console.log(' --tool-call-parser qwen3_coder --enable-auto-tool-choice --max-model-len 200000');
+ console.log(' --tool-call-parser qwen3_coder --enable-auto-tool-choice --gpu-memory-utilization 0.95 --max-model-len 200000');

  if (this.config.active && this.config.pods[this.config.active]) {
  console.log(`\nActive pod: ${this.config.active} (${this.config.pods[this.config.active].ssh})`);
@@ -741,9 +1159,9 @@ class PrimeIntellectCLI {
  }
  }

- getRunningModels() {
+ getRunningModels(podName = null) {
  try {
- const output = this.ssh('python3 vllm_manager.py list');
+ const output = this.ssh('python3 vllm_manager.py list', false, false, podName);
  const models = {};

  // Parse the output to extract model info
@@ -817,9 +1235,18 @@ class PrimeIntellectCLI {
  break;

  case 'list':
- case 'ls':
- this.list();
+ case 'ls': {
+ let podName = null;
+
+ // Parse --pod parameter
+ const podIndex = args.indexOf('--pod');
+ if (podIndex !== -1 && args[podIndex + 1]) {
+ podName = args[podIndex + 1];
+ }
+
+ this.list(podName);
  break;
+ }

  case 'search':
  if (!args[0]) {
@@ -830,42 +1257,116 @@ class PrimeIntellectCLI {
  await this.searchModels(args[0]);
  break;

+ case 'downloads': {
+ let podName = null;
+ let live = false;
+
+ // Parse --pod parameter
+ const podIndex = args.indexOf('--pod');
+ if (podIndex !== -1 && args[podIndex + 1]) {
+ podName = args[podIndex + 1];
+ }
+
+ // Parse --live parameter
+ if (args.includes('--live')) {
+ live = true;
+ }
+
+ await this.checkDownloads(podName, live);
+ break;
+ }
+
  case 'start':
  await this.handleStart(args);
  break;

- case 'stop':
- this.stop(args[0]);
+ case 'stop': {
+ let modelName = args[0];
+ let podName = null;
+
+ // Parse --pod parameter
+ const podIndex = args.indexOf('--pod');
+ if (podIndex !== -1 && args[podIndex + 1]) {
+ podName = args[podIndex + 1];
+ // Remove --pod and its value from args
+ args.splice(podIndex, 2);
+ modelName = args[0]; // Update modelName after removing --pod
+ }
+
+ this.stop(modelName, podName);
  break;
+ }
+
+ case 'logs': {
+ let modelName = args[0];
+ let podName = null;
+
+ // Parse --pod parameter
+ const podIndex = args.indexOf('--pod');
+ if (podIndex !== -1 && args[podIndex + 1]) {
+ podName = args[podIndex + 1];
+ // Remove --pod and its value from args
+ args.splice(podIndex, 2);
+ modelName = args[0]; // Update modelName after removing --pod
+ }

- case 'logs':
- await this.logs(args[0], false); // autoExit = false for manual logs command
+ await this.logs(modelName, false, podName); // autoExit = false for manual logs command
  break;
+ }

  case 'prompt': {
  if (args.length < 2) {
- console.error('Usage: pi prompt <model_name> "<message>"');
+ console.error('Usage: pi prompt <model_name> "<message>" [--pod <pod-name>]');
  console.error('Example: pi prompt phi3 "Hey, how you going"');
  process.exit(1);
  }
- const modelName = args[0];
+ let modelName = args[0];
+ let podName = null;
+
+ // Parse --pod parameter
+ const podIndex = args.indexOf('--pod');
+ if (podIndex !== -1 && args[podIndex + 1]) {
+ podName = args[podIndex + 1];
+ // Remove --pod and its value from args
+ args.splice(podIndex, 2);
+ }
+
  const message = args.slice(1).join(' ');
- this.prompt(modelName, message);
+ this.prompt(modelName, message, podName);
  break;
  }
- case 'shell':
- await this.shell();
+ case 'shell': {
+ let podName = null;
+
+ // Parse --pod parameter
+ const podIndex = args.indexOf('--pod');
+ if (podIndex !== -1 && args[podIndex + 1]) {
+ podName = args[podIndex + 1];
+ }
+
+ await this.shell(podName);
  break;
+ }
+
+ case 'ssh': {
+ let podName = null;
+ let sshArgs = [...args];
+
+ // For ssh, --pod must be the first parameter if present
+ if (args[0] === '--pod' && args[1]) {
+ podName = args[1];
+ sshArgs = args.slice(2); // Remove --pod and podName from args
+ }

- case 'ssh':
  // Pass through any SSH command
- if (args.length > 0) {
- const output = this.ssh(args.join(' '));
+ if (sshArgs.length > 0) {
+ const output = this.ssh(sshArgs.join(' '), false, false, podName);
  console.log(output);
  } else {
- this.shell();
+ await this.shell(podName);
  }
  break;
+ }

  default:
  this.showHelp();
@@ -874,5 +1375,5 @@ class PrimeIntellectCLI {
  }

  // Run CLI
- const cli = new PrimeIntellectCLI();
+ const cli = new PiCli();
  cli.run().catch(console.error);