@mariozechner/pi 0.2.4 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/pi.js DELETED
@@ -1,1379 +0,0 @@
1
- #!/usr/bin/env node
2
- /**
3
- * pi CLI: manage vLLM model servers on remote GPU pods over SSH.
4
- */
5
-
6
- const fs = require('fs');
7
- const { execSync, spawn } = require('child_process');
8
- const path = require('path');
9
- const os = require('os');
10
-
11
- const CONFIG_FILE = path.join(os.homedir(), '.pi_config');
12
- const SCRIPT_DIR = __dirname;
13
-
14
- class PiCli {
15
- constructor() {
16
- this.loadConfig();
17
- }
18
-
19
- loadConfig() {
20
- if (fs.existsSync(CONFIG_FILE)) {
21
- this.config = JSON.parse(fs.readFileSync(CONFIG_FILE, 'utf8'));
22
- // Migrate old single-pod config
23
- if (this.config.ssh && !this.config.pods) {
24
- this.config = {
25
- pods: { 'default': { ssh: this.config.ssh } },
26
- active: 'default'
27
- };
28
- this.saveConfig();
29
- }
30
- } else {
31
- this.config = { pods: {}, active: null };
32
- }
33
- }
34
-
35
- saveConfig() {
36
- fs.writeFileSync(CONFIG_FILE, JSON.stringify(this.config, null, 2));
37
- }
38
-
39
- getActivePod() {
40
- if (!this.config.active || !this.config.pods[this.config.active]) {
41
- return null;
42
- }
43
- return this.config.pods[this.config.active];
44
- }
45
-
46
- ssh(command, interactive = false, skipPirc = false, podName = null) {
47
- const pod = podName ? this.config.pods[podName] : this.getActivePod();
48
- if (!pod) {
49
- if (podName) {
50
- console.error(`Pod '${podName}' not found`);
51
- console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
52
- } else {
53
- console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
54
- console.error('Example: pi setup prod "root@135.181.71.41 -p 22"');
55
- console.error('Or activate an existing pod: pi pod <pod-name>');
56
- }
57
- process.exit(1);
58
- }
59
-
60
- // Wrap command to source .pirc first (if it exists), unless skipPirc is true
61
- const finalCommand = skipPirc ? command : `[ -f ~/.pirc ] && source ~/.pirc; ${command}`;
62
-
63
- if (interactive) {
64
- // For interactive commands, use spawn with shell
65
- const sshParts = pod.ssh.split(' ');
66
- const sshCmd = ['ssh', ...sshParts, finalCommand];
67
- const proc = spawn(sshCmd[0], sshCmd.slice(1), { stdio: 'inherit', shell: false });
68
- return new Promise((resolve) => {
69
- proc.on('close', resolve);
70
- });
71
- } else {
72
- const sshCmd = `ssh ${pod.ssh} ${JSON.stringify(finalCommand)}`;
73
-
74
- // For non-interactive, use execSync
75
- try {
76
- return execSync(sshCmd, { encoding: 'utf8' });
77
- } catch (e) {
78
- if (e.status !== 0) {
79
- console.error('SSH command failed:', e.message);
80
- process.exit(1);
81
- }
82
- throw e;
83
- }
84
- }
85
- }
86
-
87
- scp(localFile, remotePath = '~/', podName = null) {
88
- const pod = podName ? this.config.pods[podName] : this.getActivePod();
89
- if (!pod) {
90
- if (podName) {
91
- console.error(`Pod '${podName}' not found`);
92
- } else {
93
- console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
94
- }
95
- process.exit(1);
96
- }
97
-
98
- const [userHost, ...sshArgs] = pod.ssh.split(' ');
99
- let scpCmd = `scp`;
100
-
101
- // Add port if specified
102
- const portArg = sshArgs.find(arg => arg === '-p');
103
- if (portArg) {
104
- const portIndex = sshArgs.indexOf(portArg);
105
- const port = sshArgs[portIndex + 1];
106
- scpCmd += ` -P ${port}`;
107
- }
108
-
109
- scpCmd += ` ${localFile} ${userHost}:${remotePath}`;
110
-
111
- try {
112
- execSync(scpCmd, { stdio: 'inherit' });
113
- } catch (e) {
114
- console.error('SCP failed:', e.message);
115
- process.exit(1);
116
- }
117
- }
118
-
119
- async setup(podName, sshCommand) {
120
- if (!podName || !sshCommand) {
121
- console.error('Usage: pi setup <pod-name> <ssh_command>');
122
- console.error('Example: pi setup prod "root@135.181.71.41 -p 22"');
123
- process.exit(1);
124
- }
125
-
126
- // Remove "ssh " prefix if present
127
- if (sshCommand.toLowerCase().startsWith('ssh ')) {
128
- sshCommand = sshCommand.substring(4);
129
- }
130
-
131
- // Save pod config
132
- if (!this.config.pods) {
133
- this.config.pods = {};
134
- }
135
- this.config.pods[podName] = { ssh: sshCommand };
136
- this.config.active = podName;
137
- this.saveConfig();
138
- console.log(`Saved pod '${podName}' with SSH: ${sshCommand}`);
139
-
140
- // Test connection
141
- console.log('\nTesting SSH connection...');
142
- try {
143
- const hostname = this.ssh('hostname', false, true).trim();
144
- console.log(`✓ Connected to ${hostname}`);
145
- } catch (e) {
146
- console.error('✗ SSH connection failed');
147
- process.exit(1);
148
- }
149
-
150
- // Copy setup files
151
- console.log('\nCopying setup files...');
152
- this.scp(path.join(SCRIPT_DIR, 'pod_setup.sh'));
153
- this.scp(path.join(SCRIPT_DIR, 'vllm_manager.py'));
154
-
155
- // Run setup with HF_TOKEN
156
- console.log('\nRunning setup script...');
157
- const hfToken = process.env.HF_TOKEN;
158
- if (!hfToken) {
159
- console.error('\nERROR: HF_TOKEN environment variable not set');
160
- console.error('Please export HF_TOKEN before running setup');
161
- process.exit(1);
162
- }
163
- await this.ssh(`export HF_TOKEN="${hfToken}" && bash pod_setup.sh`, true, true);
164
-
165
- console.log('\n✓ Setup complete!');
166
-
167
- // Show usage help
168
- this.showHelp();
169
- }
170
-
171
- list(podName = null) {
172
- const output = this.ssh('python3 vllm_manager.py list', false, false, podName);
173
- console.log(output);
174
- }
175
-
176
- parseContextSize(value) {
177
- if (!value) return 8192;
178
-
179
- // Convert string to lowercase for case-insensitive matching
180
- const lower = value.toString().toLowerCase();
181
-
182
- // Handle 'k' suffix (4k, 8k, 32k, etc)
183
- if (lower.endsWith('k')) {
184
- return parseInt(lower.slice(0, -1)) * 1024;
185
- }
186
-
187
- // Handle plain numbers
188
- return parseInt(value);
189
- }
190
-
191
- parseMemory(value) {
192
- if (!value) return 0.9;
193
-
194
- const str = value.toString().toLowerCase();
195
-
196
- // Handle percentage (30%, 50%, etc)
197
- if (str.endsWith('%')) {
198
- return parseInt(str.slice(0, -1)) / 100;
199
- }
200
-
201
- // Handle decimal (0.3, 0.5, etc)
202
- const num = parseFloat(str);
203
- if (num > 1) {
204
- console.error('Memory must be between 0-1 or 0-100%');
205
- process.exit(1);
206
- }
207
- return num;
208
- }
209
-
210
- async handleStart(args) {
211
- if (!args[0]) {
212
- console.error('Usage: pi start <model> [options]');
213
- console.error('');
214
- console.error('Options:');
215
- console.error(' --name <name> Model alias (default: auto-generated)');
216
- console.error(' --context <size> Context window: 4k, 8k, 16k, 32k, 64k, 128k or 4096, 8192, etc (default: model default)');
217
- console.error(' --memory <percent> GPU memory: 30%, 50%, 90% or 0.3, 0.5, 0.9 (default: 90%)');
218
- console.error(' --all-gpus Use all GPUs with tensor parallelism (ignores --memory)');
219
- console.error(' --debug Enable debug logging for vLLM');
220
- console.error(' --pod <name> Run on specific pod (default: active pod)');
221
- console.error(' --vllm-args Pass remaining args directly to vLLM (ignores other options)');
222
- console.error('');
223
- console.error('Examples:');
224
- console.error(' pi start Qwen/Qwen2.5-7B-Instruct');
225
- console.error(' pi start Qwen/Qwen2.5-7B-Instruct --name qwen --memory 20%');
226
- console.error(' pi start meta-llama/Llama-3.1-70B-Instruct --all-gpus');
227
- console.error(' pi start meta-llama/Llama-3.1-405B --all-gpus --context 128k');
228
- console.error('');
229
- console.error(' # Custom vLLM args for Qwen3-Coder on 8xH200:');
230
- console.error(' pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-coder --vllm-args \\');
231
- console.error(' --data-parallel-size 8 --enable-expert-parallel \\');
232
- console.error(' --tool-call-parser qwen3_coder --enable-auto-tool-choice --gpu-memory-utilization 0.95 --max-model-len 200000');
233
- process.exit(1);
234
- }
235
-
236
- const modelId = args[0];
237
- let name = null;
238
- let context = null; // null means use the model's default context length
239
- let memory = 0.9;
240
- let allGpus = false;
241
- let debug = false;
242
- let vllmArgs = null;
243
- let podName = null;
244
-
245
- // Check for --vllm-args first
246
- const vllmArgsIndex = args.indexOf('--vllm-args');
247
- if (vllmArgsIndex !== -1) {
248
- // Extract name and pod if provided before --vllm-args
249
- for (let i = 1; i < vllmArgsIndex; i++) {
250
- if (args[i] === '--name' && args[i + 1]) {
251
- name = args[++i];
252
- } else if (args[i] === '--pod' && args[i + 1]) {
253
- podName = args[++i];
254
- } else if (args[i] === '--debug') {
255
- debug = true;
256
- }
257
- }
258
- // Everything after --vllm-args is passed to vLLM
259
- vllmArgs = args.slice(vllmArgsIndex + 1).join(' ');
260
- } else {
261
- // Parse normal arguments
262
- for (let i = 1; i < args.length; i++) {
263
- switch (args[i]) {
264
- case '--name':
265
- name = args[++i];
266
- break;
267
- case '--context':
268
- context = this.parseContextSize(args[++i]);
269
- break;
270
- case '--memory':
271
- memory = this.parseMemory(args[++i]);
272
- break;
273
- case '--all-gpus':
274
- allGpus = true;
275
- break;
276
- case '--debug':
277
- debug = true;
278
- break;
279
- case '--pod':
280
- podName = args[++i];
281
- break;
282
- default:
283
- console.error(`Unknown option: ${args[i]}`);
284
- process.exit(1);
285
- }
286
- }
287
- }
288
-
289
- // Check for multi-GPU setup
290
- const gpuCount = await this.getGpuCount(podName);
291
-
292
- if (allGpus) {
293
- if (memory !== 0.9) {
294
- console.log('Warning: --memory ignored with --all-gpus (using 95% memory across all GPUs)');
295
- }
296
- memory = 0.95;
297
-
298
- if (gpuCount === 1) {
299
- console.log('Note: --all-gpus specified but only 1 GPU found');
300
- allGpus = false;
301
- }
302
- }
303
-
304
- // Auto-generate name if not provided
305
- if (!name) {
306
- // Derive a short alias from the model path (e.g., "phi-3-mini-4k" from "microsoft/Phi-3-mini-4k-instruct")
307
- const parts = modelId.split('/');
308
- const modelName = parts[parts.length - 1];
309
- name = modelName.toLowerCase()
310
- .replace(/-instruct$/, '')
311
- .replace(/-chat$/, '')
312
- .replace(/[^a-z0-9-]/g, '-')
313
- .replace(/-+/g, '-')
314
- .replace(/^-|-$/g, '')
315
- .slice(0, 20);
316
- }
317
-
318
- // If vllmArgs provided, skip memory check since we don't know the parallelism strategy
319
- if (vllmArgs) {
320
- const modelEstimate = await this.getModelMemoryEstimate(modelId, context);
321
- if (modelEstimate) {
322
- console.log(`Model weights: ${modelEstimate.modelSizeGB.toFixed(1)}GB`);
323
- console.log(`Context length: ${modelEstimate.contextLength.toLocaleString()} tokens`);
324
- }
325
- console.log(`Target pod: ${podName || this.config.active || 'active pod'}`);
326
- await this.startRaw(modelId, name, vllmArgs, debug, podName);
327
- return;
328
- }
329
-
330
- // For standard deployment, check memory
331
- const modelEstimate = await this.getModelMemoryEstimate(modelId, context);
332
-
333
- // Check GPU memory before starting
334
- console.log('Checking model size and GPU memory...');
335
- console.log(`Target pod: ${podName || this.config.active || 'active pod'}`);
336
- const [memoryInfo, modelEstimateWithContext] = await Promise.all([
337
- this.getGpuMemoryInfo(podName),
338
- modelEstimate
339
- ]);
340
-
341
- if (memoryInfo && modelEstimateWithContext) {
342
- // For tensor parallel (--all-gpus), memory is distributed across GPUs
343
- const effectiveMemoryNeeded = allGpus && gpuCount > 1
344
- ? modelEstimateWithContext.estimatedMemoryGB / gpuCount
345
- : modelEstimateWithContext.estimatedMemoryGB;
346
-
347
- const memoryPerGpu = memoryInfo.freeMemoryGB / (gpuCount || 1);
348
-
349
- console.log(`Model weights: ${modelEstimateWithContext.modelSizeGB.toFixed(1)}GB`);
350
- console.log(`Context length: ${modelEstimateWithContext.contextLength.toLocaleString()} tokens`);
351
- console.log(`Note: Estimate includes model parameters only, not KV cache for context`);
352
- console.log(`Available GPU memory: ${memoryInfo.freeMemoryGB.toFixed(1)}GB total (${memoryPerGpu.toFixed(1)}GB per GPU)`);
353
-
354
- if (effectiveMemoryNeeded > memoryPerGpu) {
355
- // Warn loudly but do not abort
356
- console.error(`\n❌ BIG WARNING: Insufficient GPU memory`);
357
- if (allGpus && gpuCount > 1) {
358
- console.error(` Model needs ~${effectiveMemoryNeeded.toFixed(1)}GB per GPU but only ${memoryPerGpu.toFixed(1)}GB available`);
359
- } else {
360
- console.error(` Model needs ~${modelEstimateWithContext.estimatedMemoryGB.toFixed(1)}GB but only ${memoryInfo.freeMemoryGB.toFixed(1)}GB available`);
361
- }
362
- console.error('\n Free up memory by stopping running models:');
363
- console.error(' pi list # See running models');
364
- console.error(' pi stop <model_name> # Stop specific model');
365
- console.error(' pi stop # Stop all models\n');
366
- // Don't exit, just warn and proceed
367
- }
368
- }
369
-
370
- // Call the original start method with positional args
371
- const contextStr = context ? context.toString() : null;
372
- await this.start(modelId, name, contextStr, memory.toString(), { allGpus, gpuCount, debug, podName });
373
- }
374
-
375
- async getGpuCount(podName = null) {
376
- try {
377
- const output = this.ssh('nvidia-smi --query-gpu=name --format=csv,noheader | wc -l', false, false, podName);
378
- return parseInt(output.trim()) || 1;
379
- } catch {
380
- return 1;
381
- }
382
- }
383
-
384
- async getGpuMemoryInfo(podName = null) {
385
- try {
386
- const output = this.ssh('nvidia-smi --query-gpu=memory.total,memory.free --format=csv,noheader,nounits', false, false, podName);
387
- const lines = output.trim().split('\n');
388
- let totalMemoryGB = 0;
389
- let freeMemoryGB = 0;
390
-
391
- for (const line of lines) {
392
- const [total, free] = line.split(',').map(x => parseInt(x.trim()));
393
- totalMemoryGB += total / 1024;
394
- freeMemoryGB += free / 1024;
395
- }
396
-
397
- return { totalMemoryGB, freeMemoryGB };
398
- } catch (e) {
399
- return null;
400
- }
401
- }
402
-
403
- async getModelMemoryEstimate(modelId, contextLength = null) {
404
- try {
405
- const response = await fetch(`https://huggingface.co/api/models/${modelId}`);
406
- const data = await response.json();
407
-
408
- if (data.safetensors?.parameters) {
409
- // Calculate actual model size based on parameter counts and types
410
- const dtypeSizes = {
411
- 'F64': 8, // float64 - 8 bytes
412
- 'F32': 4, // float32 - 4 bytes
413
- 'BF16': 2, // bfloat16 - 2 bytes
414
- 'F16': 2, // float16 - 2 bytes
415
- 'I32': 4, // int32 - 4 bytes
416
- 'I16': 2, // int16 - 2 bytes
417
- 'I8': 1, // int8 - 1 byte
418
- 'U8': 1, // uint8 - 1 byte
419
- 'I4': 0.5, // int4 - 0.5 bytes (packed)
420
- 'F8_E4M3': 1, // FP8 E4M3 format - 1 byte
421
- 'F8_E5M2': 1, // FP8 E5M2 format - 1 byte
422
- 'Q8_0': 1, // GGML quantization formats
423
- 'Q4_0': 0.5, // GGML quantization formats
424
- 'Q4_1': 0.5, // GGML quantization formats
425
- 'Q5_0': 0.625, // GGML quantization formats
426
- 'Q5_1': 0.625 // GGML quantization formats
427
- };
428
-
429
- let totalBytes = 0;
430
- let paramDetails = [];
431
-
432
- // Calculate bytes for each dtype
433
- let unknownDtypes = [];
434
- for (const [dtype, paramCount] of Object.entries(data.safetensors.parameters)) {
435
- let bytesPerParam = dtypeSizes[dtype];
436
- if (bytesPerParam === undefined) {
437
- // Unknown dtype - assume 1 byte (most new formats are quantized)
438
- bytesPerParam = 1; // Conservative for memory checking
439
- unknownDtypes.push(dtype);
440
- }
441
- const bytes = paramCount * bytesPerParam;
442
- totalBytes += bytes;
443
- paramDetails.push({ dtype, count: paramCount, bytes });
444
- }
445
-
446
- if (unknownDtypes.length > 0) {
447
- console.warn(`Unknown dtype(s) found: ${unknownDtypes.join(', ')}. Assuming 1 byte per parameter.`);
448
- }
449
-
450
- const modelSizeGB = totalBytes / (1024 ** 3);
451
-
452
- // Try to get model config for context length
453
- let maxContextLength = contextLength;
454
- try {
455
- const configResponse = await fetch(`https://huggingface.co/${modelId}/raw/main/config.json`);
456
- if (configResponse.ok) {
457
- const config = await configResponse.json();
458
- maxContextLength = contextLength || config.max_position_embeddings || 8192;
459
- }
460
- } catch (e) {
461
- maxContextLength = contextLength || 8192;
462
- }
463
-
464
- return {
465
- modelSizeGB,
466
- estimatedMemoryGB: modelSizeGB, // Only model weights, not KV cache
467
- contextLength: maxContextLength,
468
- paramDetails // For debugging
469
- };
470
- }
471
-
472
- return null;
473
- } catch (e) {
474
- return null;
475
- }
476
- }
477
-
478
- async start(modelId, name, maxLen = null, gpuMemory, options = {}) {
479
- // Check if name is already in use locally first
480
- if (name) {
481
- const runningModels = this.getRunningModels(options.podName);
482
- if (runningModels[name]) {
483
- console.error(`Error: Model name '${name}' is already in use`);
484
- console.error('Running models:', Object.keys(runningModels).join(', '));
485
- process.exit(1);
486
- }
487
- }
488
-
489
- // Memory check is already done in handleStart, skip it here
490
-
491
- // Build args for vllm_manager.py
492
- let args = modelId;
493
-
494
- // Handle optional parameters
495
- if (name || maxLen || gpuMemory) {
496
- args += ` ${name || '""'}`;
497
-
498
- if (maxLen || gpuMemory) {
499
- args += ` ${maxLen || '""'}`; // Pass empty string to use vLLM default
500
-
501
- if (gpuMemory) {
502
- args += ` ${gpuMemory}`;
503
- }
504
- }
505
- }
506
-
507
- // Handle multi-GPU options
508
- let envPrefix = '';
509
- if (options.allGpus && options.gpuCount > 1) {
510
- args += ` ${options.gpuCount}`; // Pass tensor parallel size
511
- }
512
-
513
- // Add debug logging if requested
514
- if (options.debug) {
515
- envPrefix = 'VLLM_LOGGING_LEVEL=DEBUG ';
516
- }
517
-
518
- const output = this.ssh(`${envPrefix}python3 vllm_manager.py start ${args}`, false, false, options.podName);
519
-
520
- // Extract model name and connection info from output
521
- const nameMatch = output.match(/Started (\S+)/);
522
- const urlMatch = output.match(/URL: (http:\/\/[^\s]+)/);
523
- const exportMatch = output.match(/export OPENAI_BASE_URL='([^']+)'/);
524
-
525
- if (nameMatch) {
526
- const modelName = nameMatch[1];
527
- const url = urlMatch ? urlMatch[1] : null;
528
- const exportCmd = exportMatch ? `export OPENAI_BASE_URL='${exportMatch[1]}'` : null;
529
-
530
- console.log(`\nStarted ${modelName}`);
531
- console.log('Waiting for model to initialize...\n');
532
-
533
- // Set up Ctrl+C handler for manual interruption
534
- const showModelInfo = () => {
535
- console.log('\n\n' + '='.repeat(60));
536
- console.log('Model Information:');
537
- console.log('='.repeat(60));
538
- console.log(`Name: ${modelName}`);
539
- if (url) console.log(`URL: ${url}`);
540
- if (exportCmd) {
541
- console.log(`\nTo use with OpenAI clients:`);
542
- console.log(exportCmd);
543
- console.log(`export OPENAI_API_KEY='dummy'`);
544
- console.log(`export OPENAI_MODEL='${modelId}'`);
545
- }
546
- console.log('='.repeat(60));
547
- };
548
-
549
- process.on('SIGINT', () => {
550
- showModelInfo();
551
- process.exit(0);
552
- });
553
-
554
- // Watch logs until startup complete
555
- await this.logs(modelName, true, options.podName); // autoExit = true for startup
556
-
557
- // Warm up the model with a simple prompt
558
- console.log('\nWarming up model...');
559
- try {
560
- const warmupUrl = `${url}/chat/completions`;
561
- const warmupPayload = {
562
- model: modelId,
563
- messages: [{ role: 'user', content: 'Hi' }],
564
- max_tokens: 1,
565
- temperature: 0
566
- };
567
-
568
- const warmupResponse = await fetch(warmupUrl, {
569
- method: 'POST',
570
- headers: { 'Content-Type': 'application/json' },
571
- body: JSON.stringify(warmupPayload)
572
- });
573
-
574
- if (warmupResponse.ok) {
575
- console.log('✓ Model warmed up and ready!');
576
- } else {
577
- console.log('⚠ Warmup failed, but model should still work');
578
- }
579
- } catch (e) {
580
- console.log('⚠ Could not warm up model:', e.message);
581
- }
582
-
583
- // Show model info after warmup
584
- showModelInfo();
585
- } else {
586
- console.log(output);
587
- }
588
- }
589
-
590
- async startRaw(modelId, name, vllmArgs, debug = false, podName = null) {
591
- // Skip memory check for raw vLLM args since we don't know what custom settings are used
592
- console.log('Note: Memory checking disabled when using --vllm-args');
593
- // Check if name is already in use
594
- const runningModels = this.getRunningModels(podName);
595
- if (runningModels[name]) {
596
- console.error(`Error: Model name '${name}' is already in use`);
597
- console.error('Running models:', Object.keys(runningModels).join(', '));
598
- process.exit(1);
599
- }
600
-
601
- console.log(`Starting ${name} with custom vLLM args on pod: ${podName || this.config.active || 'active pod'}`);
602
-
603
- // Start vLLM with raw arguments - use base64 to safely pass complex args
604
- const base64Args = Buffer.from(vllmArgs).toString('base64');
605
- const envPrefix = debug ? 'VLLM_LOGGING_LEVEL=DEBUG ' : '';
606
- const output = this.ssh(`${envPrefix}python3 vllm_manager.py start_raw "${modelId}" "${name}" "${base64Args}"`, false, false, podName);
607
-
608
- // Extract connection info from output
609
- const urlMatch = output.match(/URL: (http:\/\/[^\s]+)/);
610
- const exportMatch = output.match(/export OPENAI_BASE_URL='([^']+)'/);
611
-
612
- if (urlMatch || exportMatch) {
613
- const url = urlMatch ? urlMatch[1] : null;
614
- const exportCmd = exportMatch ? `export OPENAI_BASE_URL='${exportMatch[1]}'` : null;
615
-
616
- console.log(`\nStarted ${name}`);
617
- console.log('Waiting for model to initialize...\n');
618
-
619
- // Set up Ctrl+C handler for manual interruption
620
- const showModelInfo = () => {
621
- console.log('\n\n' + '='.repeat(60));
622
- console.log('Model Information:');
623
- console.log('='.repeat(60));
624
- console.log(`Name: ${name}`);
625
- if (url) console.log(`URL: ${url}`);
626
- if (exportCmd) {
627
- console.log(`\nTo use with OpenAI clients:`);
628
- console.log(exportCmd);
629
- console.log(`export OPENAI_API_KEY='dummy'`);
630
- console.log(`export OPENAI_MODEL='${modelId}'`);
631
- }
632
- console.log('='.repeat(60));
633
- };
634
-
635
- process.on('SIGINT', () => {
636
- showModelInfo();
637
- process.exit(0);
638
- });
639
-
640
- // Watch logs until startup complete
641
- await this.logs(name, true, podName); // autoExit = true for startup
642
-
643
- // Warm up the model with a simple prompt
644
- console.log('\nWarming up model...');
645
- try {
646
- const warmupUrl = `${url}/chat/completions`;
647
- const warmupPayload = {
648
- model: modelId,
649
- messages: [{ role: 'user', content: 'Hi' }],
650
- max_tokens: 1,
651
- temperature: 0
652
- };
653
-
654
- const warmupResponse = await fetch(warmupUrl, {
655
- method: 'POST',
656
- headers: { 'Content-Type': 'application/json' },
657
- body: JSON.stringify(warmupPayload)
658
- });
659
-
660
- if (warmupResponse.ok) {
661
- console.log('✓ Model warmed up and ready!');
662
- } else {
663
- console.log('⚠ Warmup failed, but model should still work');
664
- }
665
- } catch (e) {
666
- console.log('⚠ Could not warm up model:', e.message);
667
- }
668
-
669
- // Show model info after warmup
670
- showModelInfo();
671
- } else {
672
- console.log(output);
673
- }
674
- }
675
-
676
- stop(name, podName = null) {
677
- if (!name) {
678
- // Stop all models
679
- const runningModels = this.getRunningModels(podName);
680
- const modelNames = Object.keys(runningModels);
681
-
682
- if (modelNames.length === 0) {
683
- console.log('No models running');
684
- // Still clean up any hanging vLLM processes
685
- console.log('Cleaning up any remaining vLLM processes...');
686
- this.ssh("ps aux | grep -E 'python.*vllm' | grep -v grep | grep -v vllm_manager.py | awk '{print $2}' | xargs -r kill -9 2>/dev/null || true", false, false, podName);
687
- return;
688
- }
689
-
690
- console.log(`Stopping ${modelNames.length} model(s): ${modelNames.join(', ')}`);
691
-
692
- for (const modelName of modelNames) {
693
- const output = this.ssh(`python3 vllm_manager.py stop ${modelName}`, false, false, podName);
694
- console.log(output);
695
- }
696
-
697
- // Final cleanup of vLLM processes after stopping all models
698
- console.log('Ensuring all vLLM processes are terminated...');
699
- this.ssh("ps aux | grep -E 'python.*vllm' | grep -v grep | grep -v vllm_manager.py | awk '{print $2}' | xargs -r kill -9 2>/dev/null || true", false, false, podName);
700
- } else {
701
- // Stop specific model
702
- const output = this.ssh(`python3 vllm_manager.py stop ${name}`, false, false, podName);
703
- console.log(output);
704
- }
705
- }
706
-
707
- async logs(name, autoExit = false, podName = null) {
708
- if (!name) {
709
- console.error('Usage: pi logs <name>');
710
- process.exit(1);
711
- }
712
-
713
- // Use vllm_manager.py to get the log file path
714
- const infoOutput = this.ssh(`python3 vllm_manager.py list`, false, false, podName);
715
-
716
- // Extract log file path from the output
717
- const lines = infoOutput.split('\n');
718
- let logFile = null;
719
- let inModel = false;
720
-
721
- for (const line of lines) {
722
- if (line.startsWith(`${name}:`)) {
723
- inModel = true;
724
- } else if (inModel && line.includes('Logs:')) {
725
- logFile = line.split('Logs:')[1].trim();
726
- break;
727
- }
728
- }
729
-
730
- if (!logFile) {
731
- console.error(`No logs found for ${name}`);
732
- process.exit(1);
733
- }
734
-
735
- // Use a custom tail that watches for startup complete
736
- const pod = podName ? this.config.pods[podName] : this.getActivePod();
737
- // Add SSH options to prevent connection issues
738
- const sshOpts = '-o ServerAliveInterval=5 -o ServerAliveCountMax=3 -o TCPKeepAlive=yes';
739
- const sshCmd = `ssh ${sshOpts} ${pod.ssh} tail -n 50 -f ${logFile}`;
740
-
741
- return new Promise((resolve) => {
742
- const [cmd, ...args] = sshCmd.split(' ');
743
- const proc = spawn(cmd, args, { stdio: ['inherit', 'pipe', 'pipe'] });
744
-
745
- let buffer = '';
746
-
747
- proc.stdout.on('data', (data) => {
748
- process.stdout.write(data);
749
- buffer += data.toString();
750
-
751
- // Only check for startup messages if autoExit is enabled
752
- if (autoExit) {
753
- if (buffer.includes('Application startup complete.') ||
754
- buffer.includes('Uvicorn running on')) {
755
- setTimeout(() => {
756
- proc.kill();
757
- resolve();
758
- }, 500); // Small delay to ensure final messages are shown
759
- }
760
- }
761
-
762
- // Keep buffer size manageable
763
- if (buffer.length > 10000) {
764
- buffer = buffer.slice(-5000);
765
- }
766
- });
767
-
768
- proc.stderr.on('data', (data) => {
769
- process.stderr.write(data);
770
- });
771
-
772
- proc.on('close', () => {
773
- resolve();
774
- });
775
- });
776
- }
777
-
778
- async shell(podName = null) {
779
- const pod = podName ? this.config.pods[podName] : this.getActivePod();
780
- if (!pod) {
781
- if (podName) {
782
- console.error(`Pod '${podName}' not found`);
783
- console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
784
- } else {
785
- console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
786
- }
787
- process.exit(1);
788
- }
789
-
790
- console.log(`Connecting to pod${podName ? ` '${podName}'` : ''}...`);
791
-
792
- // Use spawn directly for interactive shell
793
- const sshParts = pod.ssh.split(' ');
794
- const sshCmd = ['ssh', ...sshParts];
795
- const proc = spawn(sshCmd[0], sshCmd.slice(1), { stdio: 'inherit' });
796
-
797
- return new Promise((resolve) => {
798
- proc.on('close', resolve);
799
- });
800
- }
801
-
802
- listPods() {
803
- if (!this.config.pods || Object.keys(this.config.pods).length === 0) {
804
- console.log('No pods configured. Run: pi setup <pod-name> <ssh_command>');
805
- return;
806
- }
807
-
808
- console.log('Configured pods:\n');
809
-
810
- // Show active pod first
811
- if (this.config.active && this.config.pods[this.config.active]) {
812
- console.log(`● ${this.config.active} (active)`);
813
- console.log(` ${this.config.pods[this.config.active].ssh}\n`);
814
- }
815
-
816
- // Show other pods
817
- Object.keys(this.config.pods).sort().forEach(name => {
818
- if (name !== this.config.active) {
819
- console.log(`○ ${name}`);
820
- console.log(` ${this.config.pods[name].ssh}`);
821
- }
822
- });
823
- }
824
-
825
- switchPod(podName) {
826
- if (!this.config.pods || !this.config.pods[podName]) {
827
- console.error(`Pod '${podName}' not found`);
828
- console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
829
- process.exit(1);
830
- }
831
-
832
- this.config.active = podName;
833
- this.saveConfig();
834
- console.log(`Switched to pod: ${podName} (${this.config.pods[podName].ssh})`);
835
- }
836
-
837
- removePod(podName) {
838
- if (!this.config.pods || !this.config.pods[podName]) {
839
- console.error(`Pod '${podName}' not found`);
840
- console.error('Available pods:', Object.keys(this.config.pods || {}).join(', ') || 'none');
841
- process.exit(1);
842
- }
843
-
844
- delete this.config.pods[podName];
845
-
846
- // If we removed the active pod, clear it or switch to another
847
- if (this.config.active === podName) {
848
- const remainingPods = Object.keys(this.config.pods);
849
- this.config.active = remainingPods.length > 0 ? remainingPods[0] : null;
850
- }
851
-
852
- this.saveConfig();
853
- console.log(`Removed pod: ${podName}`);
854
- if (this.config.active) {
855
- console.log(`Active pod is now: ${this.config.active}`);
856
- }
857
- }
858
-
859
- async searchModels(query) {
860
- console.log(`Searching HuggingFace for models matching "${query}"...\n`);
861
-
862
- try {
863
- const response = await fetch(`https://huggingface.co/api/models?search=${query}&filter=text-generation&sort=downloads&limit=20`);
864
- const data = await response.json();
865
-
866
- if (!data || data.length === 0) {
867
- console.log('No models found');
868
- return;
869
- }
870
-
871
- // Format results
872
- console.log('Popular models (sorted by downloads):\n');
873
- for (const model of data) {
874
- const modelName = model.modelId.toLowerCase();
875
-
876
- // Skip incompatible formats
877
- if (modelName.includes('-mlx-') || modelName.includes('-mlx')) {
878
- continue; // MLX is for Apple Silicon only
879
- }
880
- if (modelName.includes('-gguf') || modelName.includes('.gguf')) {
881
- continue; // GGUF is for llama.cpp, not vLLM
882
- }
883
-
884
- const downloads = model.downloads || 0;
885
- const likes = model.likes || 0;
886
-
887
- console.log(`\x1b[1m${model.modelId}\x1b[0m`); // Bold
888
- console.log(` \x1b[36mhttps://huggingface.co/${model.modelId}\x1b[0m`); // Cyan for URL
889
- console.log(` Downloads: ${downloads.toLocaleString()} | Likes: ${likes}`);
890
-
891
- // Check for quantization
892
- if (modelName.includes('-fp8') || modelName.includes('fp8-')) {
893
- console.log(` \x1b[33mNote: FP8 quantized - requires GPU with FP8 support\x1b[0m`);
894
- }
895
-
896
- console.log(` pi start ${model.modelId}`);
897
- console.log();
898
- }
899
-
900
- // Add HuggingFace search URL
901
- console.log(`\nView more models on HuggingFace:`);
902
- console.log(`\x1b[36mhttps://huggingface.co/models?search=${encodeURIComponent(query)}&sort=downloads&pipeline_tag=text-generation\x1b[0m`);
903
- } catch (error) {
904
- console.error('Error searching models:', error.message);
905
- }
906
- }
907
-
908
- async checkDownloads(podName = null, live = false) {
909
- // Check only active pod or specified pod
910
- const targetPod = podName || this.config.active;
911
- if (!targetPod || !this.config.pods[targetPod]) {
912
- console.error('No active pod. Run: pi setup <pod-name> <ssh_command>');
913
- process.exit(1);
914
- }
915
-
916
- if (!live) {
917
- // Single check mode
918
- console.log(`Checking model downloads on pod: ${targetPod}\n`);
919
- const output = this.ssh('python3 vllm_manager.py downloads', false, false, targetPod);
920
-
921
- if (output.includes('No HuggingFace cache found') || output.includes('No models in cache')) {
922
- console.log(output);
923
- return;
924
- }
925
-
926
- // Parse and display
927
- const downloadInfo = JSON.parse(output);
928
- this._displayDownloadInfo(downloadInfo);
929
- } else {
930
- // Live streaming mode
931
- const pod = this.config.pods[targetPod];
932
- // Build SSH command with proper shell invocation
933
- const sshParts = pod.ssh.split(' ');
934
- const remoteCmd = '[ -f ~/.pirc ] && source ~/.pirc; python3 vllm_manager.py downloads --stream';
935
-
936
- return new Promise((resolve) => {
937
- const proc = spawn('ssh', [...sshParts, remoteCmd], { stdio: ['inherit', 'pipe', 'pipe'] });
938
-
939
- let buffer = '';
940
-
941
- // Handle Ctrl+C gracefully
942
- process.on('SIGINT', () => {
943
- console.log('\n\nStopping download monitor...');
944
- proc.kill('SIGTERM'); // Send SIGTERM to remote process
945
- setTimeout(() => {
946
- proc.kill('SIGKILL'); // Force kill if not terminated
947
- process.exit(0);
948
- }, 1000);
949
- });
950
-
951
- // Print header once
952
- console.log(`Monitoring model downloads on pod: ${targetPod} (Press Ctrl+C to stop)`);
953
- console.log(); // Empty line after header
954
-
955
- // Hide cursor
956
- process.stdout.write('\x1B[?25l');
957
-
958
- // Ensure cursor is shown again on exit
959
- const cleanup = () => {
960
- process.stdout.write('\x1B[?25h');
961
- };
962
- process.on('exit', cleanup);
963
- process.on('SIGINT', cleanup);
964
-
965
- let previousLineCount = 0;
966
-
967
- proc.stdout.on('data', (data) => {
968
- buffer += data.toString();
969
-
970
- // Process complete lines
971
- const lines = buffer.split('\n');
972
- buffer = lines[lines.length - 1]; // Keep incomplete line in buffer
973
-
974
- for (let i = 0; i < lines.length - 1; i++) {
975
- const line = lines[i].trim();
976
- if (line) {
977
- try {
978
- const downloadInfo = JSON.parse(line);
979
-
980
- // If we printed lines before, move cursor back up
981
- if (previousLineCount > 0) {
982
- process.stdout.write(`\x1B[${previousLineCount}A`); // Move up N lines
983
- process.stdout.write('\x1B[0J'); // Clear from cursor to end of screen
984
- }
985
-
986
- // Build all output as a single string
987
- let output = '';
988
- const addLine = (text = '') => {
989
- output += text + '\n';
990
- };
991
-
992
- if (downloadInfo.status === 'NO_CACHE' || downloadInfo.status === 'NO_MODELS') {
993
- addLine(downloadInfo.message);
994
- } else {
995
- // Build the display output
996
- for (const model of downloadInfo.models) {
997
- addLine(`Model: ${model.model}`);
998
- addLine(` Size: ${model.size_gb}GB`);
999
-
1000
- if (model.total_files > 0) {
1001
- const percentage = Math.round((model.files / model.total_files) * 100);
1002
- addLine(` Files: ${model.files}/${model.total_files} (${percentage}%)`);
1003
-
1004
- // Show progress bar
1005
- const barLength = 30;
1006
- const filled = Math.round((percentage / 100) * barLength);
1007
- const empty = barLength - filled;
1008
- const progressBar = '█'.repeat(filled) + '░'.repeat(empty);
1009
- addLine(` Progress: [${progressBar}] ${percentage}%`);
1010
- } else {
1011
- addLine(` Files: ${model.files}`);
1012
- }
1013
-
1014
- addLine(` Status: ${model.active ? '⏬ Downloading' : '⏸ Idle'}`);
1015
- addLine(); // Empty line between models
1016
- }
1017
-
1018
- if (downloadInfo.vllm_processes > 0) {
1019
- addLine(`Active vLLM processes: ${downloadInfo.vllm_processes}`);
1020
- }
1021
-
1022
- addLine();
1023
- addLine(`Last updated: ${new Date().toLocaleTimeString()}`);
1024
- }
1025
-
1026
- // Write all output at once and count lines
1027
- process.stdout.write(output);
1028
- previousLineCount = (output.match(/\n/g) || []).length;
1029
-
1030
- } catch (e) {
1031
- // Not JSON, just display as is
1032
- console.log(line);
1033
- }
1034
- }
1035
- }
1036
- });
1037
-
1038
- proc.stderr.on('data', (data) => {
1039
- process.stderr.write(data);
1040
- });
1041
-
1042
- proc.on('close', () => {
1043
- cleanup(); // Restore cursor
1044
- resolve();
1045
- });
1046
- });
1047
- }
1048
- }
1049
-
1050
- _displayDownloadInfo(downloadInfo) {
1051
- for (const model of downloadInfo.models) {
1052
- console.log(`\nModel: ${model.model}`);
1053
- console.log(` Size: ${model.size_gb}GB`);
1054
-
1055
- if (model.total_files > 0) {
1056
- const percentage = Math.round((model.files / model.total_files) * 100);
1057
- console.log(` Files: ${model.files}/${model.total_files} (${percentage}%)`);
1058
-
1059
- // Show progress bar
1060
- const barLength = 30;
1061
- const filled = Math.round((percentage / 100) * barLength);
1062
- const empty = barLength - filled;
1063
- const progressBar = '█'.repeat(filled) + '░'.repeat(empty);
1064
- console.log(` Progress: [${progressBar}] ${percentage}%`);
1065
- } else {
1066
- console.log(` Files: ${model.files}`);
1067
- }
1068
-
1069
- console.log(` Status: ${model.active ? '⏬ Downloading' : '⏸ Idle'}`);
1070
- }
1071
-
1072
- if (downloadInfo.vllm_processes > 0) {
1073
- console.log(`\nActive vLLM processes: ${downloadInfo.vllm_processes}`);
1074
- }
1075
-
1076
- // Show timestamp
1077
- console.log(`\nLast updated: ${new Date().toLocaleTimeString()}`);
1078
- }
1079
-
1080
- async prompt(name, message, podName = null) {
1081
- // Get model info
1082
- const models = this.getRunningModels(podName);
1083
- const model = models[name];
1084
-
1085
- if (!model || !model.url) {
1086
- console.error(`Model '${name}' is not running${podName ? ` on pod '${podName}'` : ''}`);
1087
- console.error('Running models:', Object.keys(models).join(', ') || 'none');
1088
- process.exit(1);
1089
- }
1090
-
1091
- // Make API call directly to the model's external URL
1092
- const url = `${model.url}/chat/completions`;
1093
- const payload = {
1094
- model: model.model_id,
1095
- messages: [{ role: 'user', content: message }],
1096
- max_tokens: 500,
1097
- temperature: 0.7
1098
- };
1099
-
1100
- try {
1101
- const response = await fetch(url, {
1102
- method: 'POST',
1103
- headers: { 'Content-Type': 'application/json' },
1104
- body: JSON.stringify(payload)
1105
- });
1106
-
1107
- if (!response.ok) {
1108
- throw new Error(`HTTP ${response.status}: ${await response.text()}`);
1109
- }
1110
-
1111
- const data = await response.json();
1112
- console.log(data.choices[0].message.content);
1113
- } catch (error) {
1114
- console.error('Error:', error.message);
1115
- process.exit(1);
1116
- }
1117
- }
1118
-
1119
- showHelp() {
1120
- console.log('\npi CLI\n');
1121
-
1122
- console.log('Pod Management:');
1123
- console.log(' pi setup <pod-name> <ssh_command> Configure and activate a pod');
1124
- console.log(' pi pods List all pods (active pod marked)');
1125
- console.log(' pi pod <pod-name> Switch active pod');
1126
- console.log(' pi pod remove <pod-name> Remove pod from config\n');
1127
- console.log('Model Management:');
1128
- console.log(' pi list [--pod <pod-name>] List running models');
1129
- console.log(' pi search <query> Search HuggingFace models');
1130
- console.log(' pi start <model> [options] Start a model');
1131
- console.log(' pi stop [name] [--pod <pod-name>] Stop a model (or all if no name)');
1132
- console.log(' pi logs <name> [--pod <pod-name>] View model logs');
1133
- console.log(' pi prompt <name> <msg> [--pod <pod-name>] Chat with a model');
1134
- console.log(' pi downloads [--pod <pod-name>] [--live] Check model download progress (--live for continuous monitoring)\n');
1135
- console.log('Start Options:');
1136
- console.log(' --name <name> Model alias (default: auto-generated)');
1137
- console.log(' --context <size> Context window: 4k, 8k, 16k, 32k, 64k, 128k (default: model default)');
1138
- console.log(' --memory <percent> GPU memory: 30%, 50%, 90% (default: 90%)');
1139
- console.log(' --all-gpus Use all GPUs with tensor parallelism');
1140
- console.log(' --pod <pod-name> Run on specific pod without switching active pod');
1141
- console.log(' --debug Enable debug logging for vLLM');
1142
- console.log(' --vllm-args Pass remaining args directly to vLLM\n');
1143
- console.log('Utility:');
1144
- console.log(' pi shell [--pod <pod-name>] SSH into pod');
1145
- console.log(' pi ssh [--pod <pod-name>] <cmd> Run SSH command on pod');
1146
-
1147
- console.log('\nQuick Examples:');
1148
- console.log(' pi start Qwen/Qwen2.5-7B-Instruct --name qwen');
1149
- console.log(' pi prompt qwen "What is 2+2?"');
1150
- console.log('\n # Qwen3-Coder on 8xH200 with custom vLLM args:');
1151
- console.log(' pi start Qwen/Qwen3-Coder-480B-A35B-Instruct-FP8 --name qwen-coder --vllm-args \\');
1152
- console.log(' --data-parallel-size 8 --enable-expert-parallel \\');
1153
- console.log(' --tool-call-parser qwen3_coder --enable-auto-tool-choice --gpu-memory-utilization 0.95 --max-model-len 200000');
1154
-
1155
- if (this.config.active && this.config.pods[this.config.active]) {
1156
- console.log(`\nActive pod: ${this.config.active} (${this.config.pods[this.config.active].ssh})`);
1157
- } else {
1158
- console.log('\nNo active pod');
1159
- }
1160
- }
1161
-
1162
- getRunningModels(podName = null) {
1163
- try {
1164
- const output = this.ssh('python3 vllm_manager.py list', false, false, podName);
1165
- const models = {};
1166
-
1167
- // Parse the output to extract model info
1168
- const lines = output.split('\n');
1169
- let currentModel = null;
1170
-
1171
- for (const line of lines) {
1172
- if (line.match(/^[a-zA-Z0-9_-]+:$/)) {
1173
- currentModel = line.slice(0, -1);
1174
- models[currentModel] = {};
1175
- } else if (currentModel) {
1176
- if (line.includes('Model:')) {
1177
- models[currentModel].model_id = line.split('Model:')[1].trim();
1178
- } else if (line.includes('Port:')) {
1179
- models[currentModel].port = parseInt(line.split('Port:')[1].trim());
1180
- } else if (line.includes('URL:')) {
1181
- models[currentModel].url = line.split('URL:')[1].trim();
1182
- }
1183
- }
1184
- }
1185
-
1186
- return models;
1187
- } catch (e) {
1188
- return {};
1189
- }
1190
- }
1191
-
1192
- async run() {
1193
- const [,, command, ...args] = process.argv;
1194
-
1195
- // Handle --version flag
1196
- if (command === '--version' || command === '-v') {
1197
- const packageJsonPath = path.join(__dirname, 'package.json');
1198
- try {
1199
- const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf8'));
1200
- console.log(packageJson.version);
1201
- } catch (error) {
1202
- console.error('Error reading version:', error.message);
1203
- process.exit(1);
1204
- }
1205
- return;
1206
- }
1207
-
1208
- switch (command) {
1209
- case 'setup': {
1210
- if (args.length < 2) {
1211
- console.error('Usage: pi setup <pod-name> <ssh_command>');
1212
- console.error('Example: pi setup prod "root@135.181.71.41 -p 22"');
1213
- process.exit(1);
1214
- }
1215
- const podName = args[0];
1216
- const sshCmd = args.slice(1).join(' ');
1217
- await this.setup(podName, sshCmd);
1218
- break;
1219
- }
1220
- case 'pods':
1221
- this.listPods();
1222
- break;
1223
-
1224
- case 'pod':
1225
- if (!args[0]) {
1226
- console.error('Usage: pi pod <pod-name>');
1227
- console.error(' pi pod remove <pod-name>');
1228
- process.exit(1);
1229
- }
1230
- if (args[0] === 'remove' && args[1]) {
1231
- this.removePod(args[1]);
1232
- } else {
1233
- this.switchPod(args[0]);
1234
- }
1235
- break;
1236
-
1237
- case 'list':
1238
- case 'ls': {
1239
- let podName = null;
1240
-
1241
- // Parse --pod parameter
1242
- const podIndex = args.indexOf('--pod');
1243
- if (podIndex !== -1 && args[podIndex + 1]) {
1244
- podName = args[podIndex + 1];
1245
- }
1246
-
1247
- this.list(podName);
1248
- break;
1249
- }
1250
-
1251
- case 'search':
1252
- if (!args[0]) {
1253
- console.error('Usage: pi search <query>');
1254
- console.error('Example: pi search qwen');
1255
- process.exit(1);
1256
- }
1257
- await this.searchModels(args[0]);
1258
- break;
1259
-
1260
- case 'downloads': {
1261
- let podName = null;
1262
- let live = false;
1263
-
1264
- // Parse --pod parameter
1265
- const podIndex = args.indexOf('--pod');
1266
- if (podIndex !== -1 && args[podIndex + 1]) {
1267
- podName = args[podIndex + 1];
1268
- }
1269
-
1270
- // Parse --live parameter
1271
- if (args.includes('--live')) {
1272
- live = true;
1273
- }
1274
-
1275
- await this.checkDownloads(podName, live);
1276
- break;
1277
- }
1278
-
1279
- case 'start':
1280
- await this.handleStart(args);
1281
- break;
1282
-
1283
- case 'stop': {
1284
- let modelName = args[0];
1285
- let podName = null;
1286
-
1287
- // Parse --pod parameter
1288
- const podIndex = args.indexOf('--pod');
1289
- if (podIndex !== -1 && args[podIndex + 1]) {
1290
- podName = args[podIndex + 1];
1291
- // Remove --pod and its value from args
1292
- args.splice(podIndex, 2);
1293
- modelName = args[0]; // Update modelName after removing --pod
1294
- }
1295
-
1296
- this.stop(modelName, podName);
1297
- break;
1298
- }
1299
-
1300
- case 'logs': {
1301
- let modelName = args[0];
1302
- let podName = null;
1303
-
1304
- // Parse --pod parameter
1305
- const podIndex = args.indexOf('--pod');
1306
- if (podIndex !== -1 && args[podIndex + 1]) {
1307
- podName = args[podIndex + 1];
1308
- // Remove --pod and its value from args
1309
- args.splice(podIndex, 2);
1310
- modelName = args[0]; // Update modelName after removing --pod
1311
- }
1312
-
1313
- await this.logs(modelName, false, podName); // autoExit = false for manual logs command
1314
- break;
1315
- }
1316
-
1317
- case 'prompt': {
1318
- if (args.length < 2) {
1319
- console.error('Usage: pi prompt <model_name> "<message>" [--pod <pod-name>]');
1320
- console.error('Example: pi prompt phi3 "Hey, how you going"');
1321
- process.exit(1);
1322
- }
1323
- let modelName = args[0];
1324
- let podName = null;
1325
-
1326
- // Parse --pod parameter
1327
- const podIndex = args.indexOf('--pod');
1328
- if (podIndex !== -1 && args[podIndex + 1]) {
1329
- podName = args[podIndex + 1];
1330
- // Remove --pod and its value from args
1331
- args.splice(podIndex, 2);
- modelName = args[0]; // Update modelName after removing --pod
1332
- }
1333
-
1334
- const message = args.slice(1).join(' ');
1335
- await this.prompt(modelName, message, podName);
1336
- break;
1337
- }
1338
- case 'shell': {
1339
- let podName = null;
1340
-
1341
- // Parse --pod parameter
1342
- const podIndex = args.indexOf('--pod');
1343
- if (podIndex !== -1 && args[podIndex + 1]) {
1344
- podName = args[podIndex + 1];
1345
- }
1346
-
1347
- await this.shell(podName);
1348
- break;
1349
- }
1350
-
1351
- case 'ssh': {
1352
- let podName = null;
1353
- let sshArgs = [...args];
1354
-
1355
- // For ssh, --pod must be the first parameter if present
1356
- if (args[0] === '--pod' && args[1]) {
1357
- podName = args[1];
1358
- sshArgs = args.slice(2); // Remove --pod and podName from args
1359
- }
1360
-
1361
- // Pass through any SSH command
1362
- if (sshArgs.length > 0) {
1363
- const output = this.ssh(sshArgs.join(' '), false, false, podName);
1364
- console.log(output);
1365
- } else {
1366
- await this.shell(podName);
1367
- }
1368
- break;
1369
- }
1370
-
1371
- default:
1372
- this.showHelp();
1373
- }
1374
- }
1375
- }
1376
-
1377
- // Run CLI
1378
- const cli = new PiCli();
1379
- cli.run().catch(console.error);