@mariozechner/pi 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/pod_setup.sh ADDED
@@ -0,0 +1,133 @@
+ #!/bin/bash
+ # Pod setup script for GPU instances. Assumes an Ubuntu-based system with CUDA drivers installed.
+
+ set -e
+
+ echo "=== Pod Setup ==="
+
+ # Update and install basics
+ sudo apt update
+ sudo apt install -y python3-pip python3-venv
+
+ # Create virtual environment for vLLM
+ VENV_PATH="$HOME/vllm_env"
+ echo "Creating virtual environment at $VENV_PATH..."
+ python3 -m venv "$VENV_PATH"
+
+ # Activate virtual environment
+ source "$VENV_PATH/bin/activate"
+
+ # Upgrade pip in virtual environment
+ pip install --upgrade pip
+
+ # Install vLLM and dependencies
+ echo "Installing vLLM and dependencies..."
+
+ # Detect CUDA version and install appropriate PyTorch
+ # First try nvidia-smi (more commonly available), then nvcc
+ if command -v nvidia-smi &> /dev/null; then
+     CUDA_VERSION=$(nvidia-smi | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+' | head -1)
+     echo "Detected CUDA version from nvidia-smi: $CUDA_VERSION"
+ elif command -v nvcc &> /dev/null; then
+     CUDA_VERSION=$(nvcc --version | grep "release" | sed -n 's/.*release \([0-9]\+\.[0-9]\+\).*/\1/p')
+     echo "Detected CUDA version from nvcc: $CUDA_VERSION"
+ else
+     CUDA_VERSION=""
+ fi
+
+ if [ -n "$CUDA_VERSION" ]; then
+     # Map CUDA version to PyTorch index
+     case "$CUDA_VERSION" in
+         12.8*)
+             echo "Installing PyTorch with CUDA 12.8 support"
+             pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+             ;;
+         12.7*)
+             echo "Installing PyTorch with CUDA 12.7 support"
+             pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu127
+             ;;
+         12.6*)
+             echo "Installing PyTorch with CUDA 12.6 support"
+             pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
+             ;;
+         12.4*)
+             echo "Installing PyTorch with CUDA 12.4 support"
+             pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+             ;;
+         12.1*)
+             echo "Installing PyTorch with CUDA 12.1 support"
+             pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+             ;;
+         11.8*)
+             echo "Installing PyTorch with CUDA 11.8 support"
+             pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+             ;;
+         *)
+             echo "CUDA $CUDA_VERSION detected - using default PyTorch (may not be optimal)"
+             pip install torch torchvision torchaudio
+             ;;
+     esac
+ else
+     echo "WARNING: could not detect CUDA version (nvidia-smi and nvcc not found), installing default PyTorch"
+     pip install torch torchvision torchaudio
+ fi
+
+ pip install vllm huggingface-hub psutil
+
+ # Install FlashInfer for better performance (~15% sampler latency reduction)
+ echo "Installing FlashInfer for performance optimization..."
+ echo "Building FlashInfer from source..."
+
+ # Clone and build FlashInfer from source
+ cd /tmp
+ if [ -d "flashinfer" ]; then
+     rm -rf flashinfer
+ fi
+
+ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
+ cd flashinfer
+
+ # Install from source
+ if python -m pip install -v .; then
+     echo "FlashInfer successfully built from source"
+ else
+     echo "FlashInfer installation failed (optional)"
+ fi
+
+ # Clean up
+ cd /
+ rm -rf /tmp/flashinfer
+
+ # Setup HuggingFace token from environment
+ if [ -z "$HF_TOKEN" ]; then
+     echo "ERROR: HF_TOKEN environment variable not set"
+     echo "Please export HF_TOKEN before running setup"
+     exit 1
+ fi
+
+ # Create directory for vLLM config
+ mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track
+
+ # Create .pirc file for consistent environment
+ cat > ~/.pirc << EOF
+ # Prime Intellect CLI environment
+ # This file is sourced by all pi commands
+
+ # Activate vLLM virtual environment if it exists
+ if [ -d "\$HOME/vllm_env" ]; then
+     source "\$HOME/vllm_env/bin/activate"
+ fi
+
+ # Performance optimizations
+ export VLLM_USE_FLASHINFER_SAMPLER=1
+ export VLLM_USE_DEEP_GEMM=1
+ export VLLM_NO_USAGE_STATS=1
+ export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
+
+ # HuggingFace tokens
+ export HF_TOKEN="$HF_TOKEN"
+ export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
+ EOF
+
+ # Copy manager script
+ echo "Setup complete!"
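
For orientation, a minimal invocation sketch for the setup script above (the token value is a placeholder, not part of the package; per the script, it aborts if HF_TOKEN is unset, installs PyTorch/vLLM into ~/vllm_env, and writes ~/.pirc for later shells to source):

    export HF_TOKEN=hf_xxxxxxxx        # placeholder token; required by pod_setup.sh
    bash pod_setup.sh                  # installs PyTorch/vLLM into ~/vllm_env, writes ~/.pirc
    source ~/.pirc                     # pick up the venv and VLLM_* settings in a new shell
    python -c "import vllm; print(vllm.__version__)"   # optional sanity check
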
package/vllm_manager.py ADDED
@@ -0,0 +1,499 @@
+ #!/usr/bin/env python3
+ """
+ Simple vLLM Manager - Run multiple models on different ports
+ """
+
+ import os
+ import json
+ import subprocess as sp
+ import psutil
+ import socket
+ import base64
+ from pathlib import Path
+ from typing import Dict, Optional
+ from datetime import datetime
+
+ # Config
+ CONFIG_FILE = Path.home() / ".vllm_manager.json"
+ LOGS_DIR = Path.home() / ".vllm_logs"
+ BASE_PORT = 8001  # Start from 8001, leave 8000 free
+
+ class VLLMManager:
+     def __init__(self):
+         self.models = {}  # name -> {pid, port, model_id, log_file}
+         self.load()
+         LOGS_DIR.mkdir(exist_ok=True)
+
+     def load(self):
+         if CONFIG_FILE.exists():
+             with open(CONFIG_FILE) as f:
+                 self.models = json.load(f)
+
+     def save(self):
+         with open(CONFIG_FILE, "w") as f:
+             json.dump(self.models, f, indent=2)
+
+     def is_running(self, pid: int) -> bool:
+         try:
+             process = psutil.Process(pid)
+             return process.is_running()
+         except psutil.Error:
+             return False
+
+     def find_free_port(self) -> int:
+         used_ports = {info['port'] for info in self.models.values()}
+         for port in range(BASE_PORT, BASE_PORT + 10):
+             if port not in used_ports:
+                 with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                     try:
+                         s.bind(('', port))
+                         return port
+                     except OSError:
+                         continue
+         raise Exception("No free ports")
+
+     def get_gpu_count(self) -> int:
+         try:
+             result = sp.run(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'],
+                             capture_output=True, text=True)
+             if result.returncode == 0:
+                 return len(result.stdout.strip().split('\n'))
+         except OSError:
+             pass
+         return 1
+
+     def find_available_gpu(self) -> Optional[int]:
+         """Find the next available GPU that's not heavily used"""
+         gpu_count = self.get_gpu_count()
+         if gpu_count == 1:
+             return None  # Let vLLM use default
+
+         # Get GPUs used by our models
+         used_gpus = set()
+         for info in self.models.values():
+             if 'gpu_id' in info:
+                 used_gpus.add(info['gpu_id'])
+
+         # Find first unused GPU
+         for gpu_id in range(gpu_count):
+             if gpu_id not in used_gpus:
+                 return gpu_id
+
+         # If all GPUs have at least one model, find the least loaded
+         # For now, just cycle through
+         return len(self.models) % gpu_count
+
+     def list(self):
+         # Clean up dead processes
+         to_remove = []
+         for name, info in self.models.items():
+             if not self.is_running(info['pid']):
+                 to_remove.append(name)
+
+         for name in to_remove:
+             del self.models[name]
+
+         if to_remove:
+             self.save()
+
+         return self.models
+
+     def get_tool_parser_for_model(self, model_id: str) -> tuple[Optional[str], Optional[str]]:
+         """Determine the appropriate tool parser and chat template for a model."""
+         model_lower = model_id.lower()
+
+         # Qwen models
+         if 'qwen' in model_lower:
+             if 'qwen3-coder' in model_lower:
+                 return "qwen3_coder", None  # Try qwen3_coder if it exists
+             elif 'qwen2.5' in model_lower or 'qwq' in model_lower:
+                 return "hermes", None  # Qwen2.5 uses hermes
+             else:
+                 return "hermes", None  # Default for other Qwen models
+
+         # Mistral models
+         elif 'mistral' in model_lower:
+             return "mistral", "examples/tool_chat_template_mistral_parallel.jinja"
+
+         # Llama models
+         elif 'llama' in model_lower or 'meta-llama' in model_lower:
+             if 'llama-4' in model_lower:
+                 return "llama4_pythonic", "examples/tool_chat_template_llama4_pythonic.jinja"
+             elif 'llama-3.2' in model_lower:
+                 return "llama3_json", "examples/tool_chat_template_llama3.2_json.jinja"
+             elif 'llama-3.1' in model_lower:
+                 return "llama3_json", "examples/tool_chat_template_llama3.1_json.jinja"
+             else:
+                 return "llama3_json", None
+
+         # InternLM models
+         elif 'internlm' in model_lower:
+             return "internlm", "examples/tool_chat_template_internlm2_tool.jinja"
+
+         # Jamba models
+         elif 'jamba' in model_lower:
+             return "jamba", None
+
+         # Granite models
+         elif 'granite' in model_lower:
+             if 'granite-20b-functioncalling' in model_lower:
+                 return "granite-20b-fc", "examples/tool_chat_template_granite_20b_fc.jinja"
+             elif 'granite-3.0' in model_lower:
+                 return "granite", "examples/tool_chat_template_granite.jinja"
+             else:
+                 return "granite", None
+
+         # DeepSeek models
+         elif 'deepseek' in model_lower:
+             if 'deepseek-r1' in model_lower:
+                 return "deepseek_v3", "examples/tool_chat_template_deepseekr1.jinja"
+             elif 'deepseek-v3' in model_lower:
+                 return "deepseek_v3", "examples/tool_chat_template_deepseekv3.jinja"
+             else:
+                 return "hermes", None  # Fallback for other DeepSeek models
+
+         # xLAM models
+         elif 'xlam' in model_lower:
+             if 'llama-xlam' in model_lower:
+                 return "xlam", "examples/tool_chat_template_xlam_llama.jinja"
+             else:
+                 return "xlam", "examples/tool_chat_template_xlam_qwen.jinja"
+
+         # Phi models (Microsoft)
+         elif 'phi' in model_lower:
+             # Phi models don't have tool calling tokens, disable by default
+             return None, None
+
+         # Default fallback
+         else:
+             return "hermes", None
+
+     def start(self, model_id: str, name: Optional[str] = None, max_len: Optional[int] = None, gpu_memory_utilization: Optional[float] = None, tensor_parallel_size: int = 1, gpu_ids: Optional[str] = None):
+         # Generate name
+         if not name:
+             name = model_id.split('/')[-1].lower().replace('-', '_')
+
+         # Check if already running
+         if name in self.models and self.is_running(self.models[name]['pid']):
+             return {"name": name, **self.models[name]}
+
+         # Find port
+         port = self.find_free_port()
+
+         # Create log file
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         log_file = LOGS_DIR / f"{name}_{timestamp}.log"
+
+         # Set GPU memory utilization if not specified
+         if gpu_memory_utilization is None:
+             print("WARNING: No GPU memory utilization specified, defaulting to 90%")
+             print(" Consider specifying based on model size to run multiple models")
+             print(" Examples: 0.2 for small models, 0.5 for medium, 0.9 for large")
+             gpu_memory_utilization = 0.9
+
+         # Get appropriate tool parser for the model
+         tool_parser, chat_template = self.get_tool_parser_for_model(model_id)
+
+         # Start vLLM (use venv python if available)
+         python_cmd = str(Path.home() / "vllm_env/bin/python3") if (Path.home() / "vllm_env/bin/python3").exists() else "python3"
+         cmd = [
+             python_cmd, "-m", "vllm.entrypoints.openai.api_server",
+             "--model", model_id,
+             "--host", "0.0.0.0",
+             "--port", str(port),
+             "--gpu-memory-utilization", str(gpu_memory_utilization)
+         ]
+
+         # Only add tool calling if a parser is available
+         if tool_parser:
+             print(f"Auto-detected tool parser: {tool_parser}" + (f" with chat template: {chat_template}" if chat_template else ""))
+             cmd.extend([
+                 "--enable-auto-tool-choice",
+                 "--tool-call-parser", tool_parser
+             ])
+             # Add chat template if specified
+             if chat_template:
+                 cmd.extend(["--chat-template", chat_template])
+         else:
+             print(f"Tool calling disabled for {model_id} (no compatible parser)")
+
+         # Only add max-model-len if specified
+         if max_len is not None:
+             cmd.extend(["--max-model-len", str(max_len)])
+
+         # Use environment as-is (already configured by .pirc)
+         env = os.environ.copy()
+
+         # Handle GPU assignment
+         assigned_gpu = None
+         if tensor_parallel_size > 1:
+             # Multi-GPU: cap the requested size at the number of available GPUs
+             gpu_count = self.get_gpu_count()
+             if tensor_parallel_size > gpu_count:
+                 print(f"Warning: Requested {tensor_parallel_size} GPUs but only {gpu_count} available")
+                 tensor_parallel_size = gpu_count
+         else:
+             # Single GPU: find available GPU
+             if gpu_ids:
+                 env['CUDA_VISIBLE_DEVICES'] = gpu_ids
+                 assigned_gpu = int(gpu_ids.split(',')[0])
+             else:
+                 assigned_gpu = self.find_available_gpu()
+                 if assigned_gpu is not None:
+                     env['CUDA_VISIBLE_DEVICES'] = str(assigned_gpu)
+                     print(f"Auto-assigned to GPU {assigned_gpu}")
+
+         # Add tensor parallel size if > 1 (after any capping above, so the flag matches what is recorded)
+         if tensor_parallel_size > 1:
+             cmd.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
+
+         # Open log file and start process
+         with open(log_file, 'w') as f:
+             f.write(f"=== Starting {model_id} at {datetime.now()} ===\n")
+             f.write(f"Command: {' '.join(cmd)}\n")
+             if tool_parser:
+                 f.write(f"Tool Parser: {tool_parser}\n")
+                 if chat_template:
+                     f.write(f"Chat Template: {chat_template}\n")
+             else:
+                 f.write(f"Tool Calling: Disabled (no compatible parser)\n")
+             if gpu_ids:
+                 f.write(f"CUDA_VISIBLE_DEVICES: {gpu_ids}\n")
+             if tensor_parallel_size > 1:
+                 f.write(f"Tensor Parallel Size: {tensor_parallel_size}\n")
+             f.write("=" * 60 + "\n\n")
+             f.flush()
+
+             process = sp.Popen(
+                 cmd,
+                 stdout=f,
+                 stderr=sp.STDOUT,  # Merge stderr into stdout
+                 bufsize=1,  # Line buffered
+                 universal_newlines=True,
+                 env=env  # Pass the modified environment
+             )
+
+         # Save info
+         self.models[name] = {
+             "pid": process.pid,
+             "port": port,
+             "model_id": model_id,
+             "log_file": str(log_file),
+             "gpu_id": assigned_gpu,
+             "tensor_parallel_size": tensor_parallel_size if tensor_parallel_size > 1 else 1
+         }
+         self.save()
+
+         return {"name": name, "port": port, "pid": process.pid, "log_file": str(log_file)}
+
+     def start_raw(self, model_id: str, name: str, vllm_args: str):
+         # Check if already running
+         if name in self.models and self.is_running(self.models[name]['pid']):
+             return {"name": name, **self.models[name]}
+
+         # Find port
+         port = self.find_free_port()
+
+         # Create log file
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         log_file = LOGS_DIR / f"{name}_{timestamp}.log"
+
+         # Start vLLM with raw arguments
+         python_cmd = str(Path.home() / "vllm_env/bin/python3") if (Path.home() / "vllm_env/bin/python3").exists() else "python3"
+
+         # Base command - ensure vllm_args is properly quoted
+         cmd = f'{python_cmd} -m vllm.entrypoints.openai.api_server --model "{model_id}" --host 0.0.0.0 --port {port} {vllm_args}'
+
+         # Use environment as-is (already configured by .pirc)
+         env = os.environ.copy()
+
+         # Open log file and start process
+         with open(log_file, 'w') as f:
+             f.write(f"=== Starting {model_id} at {datetime.now()} ===")
+             f.write(f"\nCommand: {cmd}\n")
+             f.write("=" * 60 + "\n\n")
+             f.flush()
+
+             # Use shell=True for the command string
+             process = sp.Popen(
+                 cmd,
+                 shell=True,
+                 stdout=f,
+                 stderr=sp.STDOUT,  # Merge stderr into stdout
+                 bufsize=1,  # Line buffered
+                 universal_newlines=True,
+                 env=env  # Pass the modified environment
+             )
+
+         # Save info
+         self.models[name] = {
+             "pid": process.pid,
+             "port": port,
+             "model_id": model_id,
+             "log_file": str(log_file),
+             "raw_args": vllm_args
+         }
+         self.save()
+
+         return {"name": name, "port": port, "pid": process.pid, "log_file": str(log_file)}
+
+     def stop(self, name: str):
+         if name not in self.models:
+             return False
+
+         info = self.models[name]
+         try:
+             process = psutil.Process(info['pid'])
+             process.terminate()
+             process.wait(timeout=5)
+         except psutil.Error:
+             pass
+
+         del self.models[name]
+         self.save()
+         return True
+
+     def logs(self, name: str, lines: int = 50):
+         if name not in self.models:
+             return None
+
+         log_file = self.models[name].get('log_file')
+         if not log_file or not Path(log_file).exists():
+             return None
+
+         # Read last N lines
+         with open(log_file, 'r') as f:
+             all_lines = f.readlines()
+             return ''.join(all_lines[-lines:])
+
+ def main():
+     import sys
+
+     manager = VLLMManager()
+
+     if len(sys.argv) < 2:
+         print("Usage: vllm_manager.py [list|start|stop|logs] ...")
+         sys.exit(1)
+
+     cmd = sys.argv[1]
+
+     if cmd == "list":
+         models = manager.list()
+         if not models:
+             print("No models running")
+         else:
+             # Get external IP
+             try:
+                 # Try to get IP from default interface
+                 result = sp.run(['hostname', '-I'], capture_output=True, text=True)
+                 if result.returncode == 0 and result.stdout.strip():
+                     host_ip = result.stdout.strip().split()[0]
+                 else:
+                     host_ip = socket.gethostbyname(socket.gethostname())
+             except Exception:
+                 host_ip = socket.gethostbyname(socket.gethostname())
+             print("Running models:")
+             for name, info in models.items():
+                 print(f"\n{name}:")
+                 print(f" Model: {info['model_id']}")
+                 print(f" HF: https://huggingface.co/{info['model_id']}")
+                 print(f" Port: {info['port']}")
+                 if 'tensor_parallel_size' in info and info.get('tensor_parallel_size', 1) > 1:
+                     print(f" GPUs: {info.get('tensor_parallel_size', 1)} (tensor parallel)")
+                 elif 'gpu_id' in info and info['gpu_id'] is not None:
+                     print(f" GPU: {info['gpu_id']}")
+                 print(f" URL: http://{host_ip}:{info['port']}/v1")
+                 if 'log_file' in info:
+                     print(f" Logs: {info['log_file']}")
+
+     elif cmd == "start":
+         if len(sys.argv) < 3:
+             print("Usage: vllm_manager.py start <model_id> [name] [max_len] [gpu_memory] [tensor_parallel_size]")
+             sys.exit(1)
+
+         model_id = sys.argv[2]
+         name = sys.argv[3] if len(sys.argv) > 3 and sys.argv[3] not in ['""', ''] else None
+         max_len = int(sys.argv[4]) if len(sys.argv) > 4 and sys.argv[4] not in ['""', ''] else None
+         gpu_memory = float(sys.argv[5]) if len(sys.argv) > 5 else None
+         tensor_parallel = int(sys.argv[6]) if len(sys.argv) > 6 else 1
+
+         model_result = manager.start(model_id, name, max_len, gpu_memory, tensor_parallel)
+         # Get external IP
+         try:
+             # Try to get IP from default interface
+             ip_result = sp.run(['hostname', '-I'], capture_output=True, text=True)
+             if ip_result.returncode == 0 and ip_result.stdout.strip():
+                 host_ip = ip_result.stdout.strip().split()[0]
+             else:
+                 host_ip = socket.gethostbyname(socket.gethostname())
+         except Exception:
+             host_ip = socket.gethostbyname(socket.gethostname())
+
+         print(f"Started {model_result['name']}")
+         print(f"URL: http://{host_ip}:{model_result['port']}/v1")
+         print("\nExport for OpenAI clients:")
+         print(f"export OPENAI_BASE_URL='http://{host_ip}:{model_result['port']}/v1'")
+
+     elif cmd == "start_raw":
+         if len(sys.argv) < 5:
+             print("Usage: vllm_manager.py start_raw <model_id> <name> <vllm_args>")
+             sys.exit(1)
+
+         model_id = sys.argv[2]
+         name = sys.argv[3]
+         vllm_args_base64 = sys.argv[4]
+
+         # Decode base64 arguments
+         vllm_args = base64.b64decode(vllm_args_base64).decode('utf-8')
+         print(f"DEBUG: Decoded vllm_args: '{vllm_args}'")
+
+         model_result = manager.start_raw(model_id, name, vllm_args)
+         # Get external IP
+         try:
+             # Try to get IP from default interface
+             ip_result = sp.run(['hostname', '-I'], capture_output=True, text=True)
+             if ip_result.returncode == 0 and ip_result.stdout.strip():
+                 host_ip = ip_result.stdout.strip().split()[0]
+             else:
+                 host_ip = socket.gethostbyname(socket.gethostname())
+         except Exception:
+             host_ip = socket.gethostbyname(socket.gethostname())
+
+         print(f"Started {model_result['name']}")
+         print(f"URL: http://{host_ip}:{model_result['port']}/v1")
+         print("\nExport for OpenAI clients:")
+         print(f"export OPENAI_BASE_URL='http://{host_ip}:{model_result['port']}/v1'")
+
+     elif cmd == "stop":
+         if len(sys.argv) < 3:
+             print("Usage: vllm_manager.py stop <name>")
+             sys.exit(1)
+
+         name = sys.argv[2]
+         if manager.stop(name):
+             print(f"Stopped {name}")
+         else:
+             print(f"Model {name} not found")
+
+     elif cmd == "logs":
+         if len(sys.argv) < 3:
+             print("Usage: vllm_manager.py logs <name> [lines]")
+             sys.exit(1)
+
+         name = sys.argv[2]
+         lines = int(sys.argv[3]) if len(sys.argv) > 3 else 50
+
+         logs = manager.logs(name, lines)
+         if logs is None:
+             print(f"No logs found for {name}")
+         else:
+             print(logs, end='')
+
+     else:
+         print(f"Unknown command: {cmd}")
+         sys.exit(1)
+
+ if __name__ == "__main__":
+     main()
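
Taken together, the manager above is driven entirely from the command line. A hedged usage sketch (the model ID, names, and pod IP are illustrative placeholders; the script itself prints the exact OPENAI_BASE_URL export after a successful start, and start_raw expects its extra vLLM flags base64-encoded to match the base64.b64decode call above):

    python3 vllm_manager.py start Qwen/Qwen2.5-7B-Instruct qwen25 8192 0.5   # model_id name max_len gpu_memory
    python3 vllm_manager.py list                                             # shows ports (8001 upward) and log paths
    python3 vllm_manager.py logs qwen25 100                                  # tail the last 100 log lines
    export OPENAI_BASE_URL='http://<pod-ip>:8001/v1'                         # as printed by the start command
    python3 vllm_manager.py start_raw Qwen/Qwen2.5-7B-Instruct qwen25_raw "$(printf '%s' '--max-model-len 8192' | base64)"
    python3 vllm_manager.py stop qwen25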