@mariozechner/pi 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +317 -0
- package/package.json +42 -0
- package/pi +860 -0
- package/pod_setup.sh +133 -0
- package/vllm_manager.py +499 -0
package/pod_setup.sh
ADDED
@@ -0,0 +1,133 @@
+#!/bin/bash
+# Pod setup script for GPU instances. Assumes Ubuntu based system with CUDA drivers installed.
+
+set -e
+
+echo "=== Pod Setup ==="
+
+# Update and install basics
+sudo apt update
+sudo apt install -y python3-pip python3-venv
+
+# Create virtual environment for vLLM
+VENV_PATH="$HOME/vllm_env"
+echo "Creating virtual environment at $VENV_PATH..."
+python3 -m venv "$VENV_PATH"
+
+# Activate virtual environment
+source "$VENV_PATH/bin/activate"
+
+# Upgrade pip in virtual environment
+pip install --upgrade pip
+
+# Install vLLM and dependencies
+echo "Installing vLLM and dependencies..."
+
+# Detect CUDA version and install appropriate PyTorch
+# First try nvidia-smi (more commonly available), then nvcc
+if command -v nvidia-smi &> /dev/null; then
+    CUDA_VERSION=$(nvidia-smi | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+' | head -1)
+    echo "Detected CUDA version from nvidia-smi: $CUDA_VERSION"
+elif command -v nvcc &> /dev/null; then
+    CUDA_VERSION=$(nvcc --version | grep "release" | sed -n 's/.*release \([0-9]\+\.[0-9]\+\).*/\1/p')
+    echo "Detected CUDA version from nvcc: $CUDA_VERSION"
+else
+    CUDA_VERSION=""
+fi
+
+if [ -n "$CUDA_VERSION" ]; then
+    # Map CUDA version to PyTorch index
+    case "$CUDA_VERSION" in
+        12.8*)
+            echo "Installing PyTorch with CUDA 12.8 support"
+            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+            ;;
+        12.7*)
+            echo "Installing PyTorch with CUDA 12.7 support"
+            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu127
+            ;;
+        12.6*)
+            echo "Installing PyTorch with CUDA 12.6 support"
+            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
+            ;;
+        12.4*)
+            echo "Installing PyTorch with CUDA 12.4 support"
+            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
+            ;;
+        12.1*)
+            echo "Installing PyTorch with CUDA 12.1 support"
+            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+            ;;
+        11.8*)
+            echo "Installing PyTorch with CUDA 11.8 support"
+            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+            ;;
+        *)
+            echo "CUDA $CUDA_VERSION detected - using default PyTorch (may not be optimal)"
+            pip install torch torchvision torchaudio
+            ;;
+    esac
+else
echo "WARNING: nvcc not found, installing default PyTorch"
|
|
72
|
+
+    pip install torch torchvision torchaudio
+fi
+
+pip install vllm huggingface-hub psutil
+
+# Install FlashInfer for better performance (~15% sampler latency reduction)
+echo "Installing FlashInfer for performance optimization..."
+echo "Building FlashInfer from source..."
+
+# Clone and build FlashInfer from source
+cd /tmp
+if [ -d "flashinfer" ]; then
+    rm -rf flashinfer
+fi
+
+git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
+cd flashinfer
+
+# Install from source
+if python -m pip install -v .; then
+    echo "FlashInfer successfully built from source"
+else
+    echo "FlashInfer installation failed (optional)"
+fi
+
+# Clean up
+cd /
+rm -rf /tmp/flashinfer
+
+# Setup HuggingFace token from environment
+if [ -z "$HF_TOKEN" ]; then
+    echo "ERROR: HF_TOKEN environment variable not set"
+    echo "Please export HF_TOKEN before running setup"
+    exit 1
+fi
+
+# Create directory for vLLM config
+mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track
+
+# Create .pirc file for consistent environment
+cat > ~/.pirc << EOF
+# Prime Intellect CLI environment
+# This file is sourced by all pi commands
+
+# Activate vLLM virtual environment if it exists
+if [ -d "\$HOME/vllm_env" ]; then
+    source "\$HOME/vllm_env/bin/activate"
+fi
+
+# Performance optimizations
+export VLLM_USE_FLASHINFER_SAMPLER=1
+export VLLM_USE_DEEP_GEMM=1
+export VLLM_NO_USAGE_STATS=1
+export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
+
+# HuggingFace tokens
+export HF_TOKEN="$HF_TOKEN"
+export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
+EOF
+
+# Copy manager script
+echo "Setup complete!"
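
A minimal usage sketch for the setup script above (not part of the package; the token value and the assumption that the script has been copied onto the pod are illustrative): it exits early unless HF_TOKEN is exported, and it finishes by writing ~/.pirc, which later shells source to pick up the venv and the vLLM environment variables.

    export HF_TOKEN=hf_xxx    # placeholder token; required, the script exits if unset
    bash pod_setup.sh         # installs PyTorch/vLLM into ~/vllm_env and writes ~/.pirc
    source ~/.pirc            # activates the venv and performance env vars in the current shell
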
package/vllm_manager.py
ADDED
@@ -0,0 +1,499 @@
+#!/usr/bin/env python3
+"""
+Simple vLLM Manager - Run multiple models on different ports
+"""
+
+import os
+import json
+import subprocess as sp
+import psutil
+import socket
+import base64
+from pathlib import Path
+from typing import Dict, Optional
+from datetime import datetime
+
+# Config
+CONFIG_FILE = Path.home() / ".vllm_manager.json"
+LOGS_DIR = Path.home() / ".vllm_logs"
+BASE_PORT = 8001  # Start from 8001, leave 8000 free
+
+class VLLMManager:
+    def __init__(self):
+        self.models = {}  # name -> {pid, port, model_id, log_file}
+        self.load()
+        LOGS_DIR.mkdir(exist_ok=True)
+
+    def load(self):
+        if CONFIG_FILE.exists():
+            with open(CONFIG_FILE) as f:
+                self.models = json.load(f)
+
+    def save(self):
+        with open(CONFIG_FILE, "w") as f:
+            json.dump(self.models, f, indent=2)
+
+    def is_running(self, pid: int) -> bool:
+        try:
+            process = psutil.Process(pid)
+            return process.is_running()
+        except:
+            return False
+
+    def find_free_port(self) -> int:
+        used_ports = {info['port'] for info in self.models.values()}
+        for port in range(BASE_PORT, BASE_PORT + 10):
+            if port not in used_ports:
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    try:
+                        s.bind(('', port))
+                        return port
+                    except:
+                        continue
+        raise Exception("No free ports")
+
+    def get_gpu_count(self) -> int:
+        try:
+            result = sp.run(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'],
+                            capture_output=True, text=True)
+            if result.returncode == 0:
+                return len(result.stdout.strip().split('\n'))
+        except:
+            pass
+        return 1
+
+    def find_available_gpu(self) -> Optional[int]:
+        """Find the next available GPU that's not heavily used"""
+        gpu_count = self.get_gpu_count()
+        if gpu_count == 1:
+            return None  # Let vLLM use default
+
+        # Get GPUs used by our models
+        used_gpus = set()
+        for info in self.models.values():
+            if 'gpu_id' in info:
+                used_gpus.add(info['gpu_id'])
+
+        # Find first unused GPU
+        for gpu_id in range(gpu_count):
+            if gpu_id not in used_gpus:
+                return gpu_id
+
+        # If all GPUs have at least one model, find the least loaded
+        # For now, just cycle through
+        return len(self.models) % gpu_count
+
+    def list(self):
+        # Clean up dead processes
+        to_remove = []
+        for name, info in self.models.items():
+            if not self.is_running(info['pid']):
+                to_remove.append(name)
+
+        for name in to_remove:
+            del self.models[name]
+
+        if to_remove:
+            self.save()
+
+        return self.models
+
+    def get_tool_parser_for_model(self, model_id: str) -> tuple[Optional[str], Optional[str]]:
"""Determine the appropriate tool parser and chat template for a model."""
|
|
103
|
+
model_lower = model_id.lower()
|
|
104
|
+
|
|
105
|
+
# Qwen models
|
|
106
|
+
if 'qwen' in model_lower:
|
|
107
|
+
if 'qwen3-coder' in model_lower:
|
|
108
|
+
return "qwen3_coder", None # Try qwen3_coder if it exists
|
|
109
|
+
elif 'qwen2.5' in model_lower or 'qwq' in model_lower:
|
|
110
|
+
return "hermes", None # Qwen2.5 uses hermes
|
|
111
|
+
else:
|
|
112
|
+
return "hermes", None # Default for other Qwen models
|
|
113
|
+
|
|
114
|
+
# Mistral models
|
|
115
|
+
elif 'mistral' in model_lower:
|
|
116
|
+
return "mistral", "examples/tool_chat_template_mistral_parallel.jinja"
|
|
117
|
+
|
|
118
|
+
# Llama models
|
|
119
|
+
elif 'llama' in model_lower or 'meta-llama' in model_lower:
|
|
120
|
+
if 'llama-4' in model_lower:
|
|
121
|
+
return "llama4_pythonic", "examples/tool_chat_template_llama4_pythonic.jinja"
|
|
122
|
+
elif 'llama-3.2' in model_lower:
|
|
123
|
+
return "llama3_json", "examples/tool_chat_template_llama3.2_json.jinja"
|
|
124
|
+
elif 'llama-3.1' in model_lower:
|
|
125
|
+
return "llama3_json", "examples/tool_chat_template_llama3.1_json.jinja"
|
|
126
|
+
else:
|
|
127
|
+
return "llama3_json", None
|
|
128
|
+
|
|
129
|
+
# InternLM models
|
|
130
|
+
elif 'internlm' in model_lower:
|
|
131
|
+
return "internlm", "examples/tool_chat_template_internlm2_tool.jinja"
|
|
132
|
+
|
|
133
|
+
# Jamba models
|
|
134
|
+
elif 'jamba' in model_lower:
|
|
135
|
+
return "jamba", None
|
|
136
|
+
|
|
137
|
+
# Granite models
|
|
138
|
+
elif 'granite' in model_lower:
|
|
139
|
+
if 'granite-20b-functioncalling' in model_lower:
|
|
140
|
+
return "granite-20b-fc", "examples/tool_chat_template_granite_20b_fc.jinja"
|
|
141
|
+
elif 'granite-3.0' in model_lower:
|
|
142
|
+
return "granite", "examples/tool_chat_template_granite.jinja"
|
|
143
|
+
else:
|
|
144
|
+
return "granite", None
|
|
145
|
+
|
|
146
|
+
# DeepSeek models
|
|
147
|
+
elif 'deepseek' in model_lower:
|
|
148
|
+
if 'deepseek-r1' in model_lower:
|
|
149
|
+
return "deepseek_v3", "examples/tool_chat_template_deepseekr1.jinja"
|
|
150
|
+
elif 'deepseek-v3' in model_lower:
|
|
151
|
+
return "deepseek_v3", "examples/tool_chat_template_deepseekv3.jinja"
|
|
152
|
+
else:
|
|
153
|
+
return "hermes", None # Fallback for other DeepSeek models
|
|
154
|
+
|
|
155
|
+
# xLAM models
|
|
156
|
+
elif 'xlam' in model_lower:
|
|
157
|
+
if 'llama-xlam' in model_lower:
|
|
158
|
+
return "xlam", "examples/tool_chat_template_xlam_llama.jinja"
|
|
159
|
+
else:
|
|
160
|
+
return "xlam", "examples/tool_chat_template_xlam_qwen.jinja"
|
|
161
|
+
|
|
162
|
+
# Phi models (Microsoft)
|
|
163
|
+
elif 'phi' in model_lower:
|
|
164
|
+
# Phi models don't have tool calling tokens, disable by default
|
|
165
|
+
return None, None
|
|
166
|
+
|
|
167
|
+
# Default fallback
|
|
168
|
+
else:
|
|
169
|
+
return "hermes", None
|
|
170
|
+
|
|
171
|
+
+    def start(self, model_id: str, name: Optional[str] = None, max_len: Optional[int] = None, gpu_memory_utilization: Optional[float] = None, tensor_parallel_size: int = 1, gpu_ids: Optional[str] = None):
+        # Generate name
+        if not name:
+            name = model_id.split('/')[-1].lower().replace('-', '_')
+
+        # Check if already running
+        if name in self.models and self.is_running(self.models[name]['pid']):
+            return self.models[name]
+
+        # Find port
+        port = self.find_free_port()
+
+        # Create log file
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        log_file = LOGS_DIR / f"{name}_{timestamp}.log"
+
+        # Set GPU memory utilization if not specified
+        if gpu_memory_utilization is None:
+            print("WARNING: No GPU memory utilization specified, defaulting to 90%")
+            print("         Consider specifying based on model size to run multiple models")
+            print("         Examples: 0.2 for small models, 0.5 for medium, 0.9 for large")
+            gpu_memory_utilization = 0.9
+
+        # Get appropriate tool parser for the model
+        tool_parser, chat_template = self.get_tool_parser_for_model(model_id)
+
+        # Start vLLM (use venv python if available)
+        python_cmd = str(Path.home() / "vllm_env/bin/python3") if (Path.home() / "vllm_env/bin/python3").exists() else "python3"
+        cmd = [
+            python_cmd, "-m", "vllm.entrypoints.openai.api_server",
+            "--model", model_id,
+            "--host", "0.0.0.0",
+            "--port", str(port),
+            "--gpu-memory-utilization", str(gpu_memory_utilization)
+        ]
+
+        # Only add tool calling if a parser is available
+        if tool_parser:
+            print(f"Auto-detected tool parser: {tool_parser}" + (f" with chat template: {chat_template}" if chat_template else ""))
+            cmd.extend([
+                "--enable-auto-tool-choice",
+                "--tool-call-parser", tool_parser
+            ])
+            # Add chat template if specified
+            if chat_template:
+                cmd.extend(["--chat-template", chat_template])
+        else:
+            print(f"Tool calling disabled for {model_id} (no compatible parser)")
+
+        # Only add max-model-len if specified
+        if max_len is not None:
+            cmd.extend(["--max-model-len", str(max_len)])
+
+        # Add tensor parallel size if > 1
+        if tensor_parallel_size > 1:
+            cmd.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
+
+        # Use environment as-is (already configured by .pirc)
+        env = os.environ.copy()
+
+        # Handle GPU assignment
+        assigned_gpu = None
+        if tensor_parallel_size > 1:
+            # Multi-GPU: use all GPUs
+            gpu_count = self.get_gpu_count()
+            if tensor_parallel_size > gpu_count:
+                print(f"Warning: Requested {tensor_parallel_size} GPUs but only {gpu_count} available")
+                tensor_parallel_size = gpu_count
+        else:
+            # Single GPU: find available GPU
+            if gpu_ids:
+                env['CUDA_VISIBLE_DEVICES'] = gpu_ids
+                assigned_gpu = int(gpu_ids.split(',')[0])
+            else:
+                assigned_gpu = self.find_available_gpu()
+                if assigned_gpu is not None:
+                    env['CUDA_VISIBLE_DEVICES'] = str(assigned_gpu)
+                    print(f"Auto-assigned to GPU {assigned_gpu}")
+
+
+        # Open log file and start process
+        with open(log_file, 'w') as f:
+            f.write(f"=== Starting {model_id} at {datetime.now()} ===\n")
+            f.write(f"Command: {' '.join(cmd)}\n")
+            if tool_parser:
+                f.write(f"Tool Parser: {tool_parser}\n")
+                if chat_template:
+                    f.write(f"Chat Template: {chat_template}\n")
+            else:
+                f.write(f"Tool Calling: Disabled (no compatible parser)\n")
+            if gpu_ids:
+                f.write(f"CUDA_VISIBLE_DEVICES: {gpu_ids}\n")
+            if tensor_parallel_size > 1:
+                f.write(f"Tensor Parallel Size: {tensor_parallel_size}\n")
+            f.write("=" * 60 + "\n\n")
+            f.flush()
+
+            process = sp.Popen(
+                cmd,
+                stdout=f,
+                stderr=sp.STDOUT,  # Merge stderr into stdout
+                bufsize=1,  # Line buffered
+                universal_newlines=True,
+                env=env  # Pass the modified environment
+            )
+
+        # Save info
+        self.models[name] = {
+            "pid": process.pid,
+            "port": port,
+            "model_id": model_id,
+            "log_file": str(log_file),
+            "gpu_id": assigned_gpu,
+            "tensor_parallel_size": tensor_parallel_size if tensor_parallel_size > 1 else 1
+        }
+        self.save()
+
+        return {"name": name, "port": port, "pid": process.pid, "log_file": str(log_file)}
+
+    def start_raw(self, model_id: str, name: str, vllm_args: str):
+        # Check if already running
+        if name in self.models and self.is_running(self.models[name]['pid']):
+            return self.models[name]
+
+        # Find port
+        port = self.find_free_port()
+
+        # Create log file
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        log_file = LOGS_DIR / f"{name}_{timestamp}.log"
+
+        # Start vLLM with raw arguments
+        python_cmd = str(Path.home() / "vllm_env/bin/python3") if (Path.home() / "vllm_env/bin/python3").exists() else "python3"
+
+        # Base command - ensure vllm_args is properly quoted
+        cmd = f'{python_cmd} -m vllm.entrypoints.openai.api_server --model "{model_id}" --host 0.0.0.0 --port {port} {vllm_args}'
+
+        # Use environment as-is (already configured by .pirc)
+        env = os.environ.copy()
+
+
+        # Open log file and start process
+        with open(log_file, 'w') as f:
+            f.write(f"=== Starting {model_id} at {datetime.now()} ===")
+            f.write(f"\nCommand: {cmd}\n")
+            f.write("=" * 60 + "\n\n")
+            f.flush()
+
+            # Use shell=True for the command string
+            process = sp.Popen(
+                cmd,
+                shell=True,
+                stdout=f,
+                stderr=sp.STDOUT,  # Merge stderr into stdout
+                bufsize=1,  # Line buffered
+                universal_newlines=True,
+                env=env  # Pass the modified environment
+            )
+
+        # Save info
+        self.models[name] = {
+            "pid": process.pid,
+            "port": port,
+            "model_id": model_id,
+            "log_file": str(log_file),
+            "raw_args": vllm_args
+        }
+        self.save()
+
+        return {"name": name, "port": port, "pid": process.pid, "log_file": str(log_file)}
+
+    def stop(self, name: str):
+        if name not in self.models:
+            return False
+
+        info = self.models[name]
+        try:
+            process = psutil.Process(info['pid'])
+            process.terminate()
+            process.wait(timeout=5)
+        except:
+            pass
+
+        del self.models[name]
+        self.save()
+        return True
+
+    def logs(self, name: str, lines: int = 50):
+        if name not in self.models:
+            return None
+
+        log_file = self.models[name].get('log_file')
+        if not log_file or not Path(log_file).exists():
+            return None
+
+        # Read last N lines
+        with open(log_file, 'r') as f:
+            all_lines = f.readlines()
+            return ''.join(all_lines[-lines:])
+
+def main():
+    import sys
+
+    manager = VLLMManager()
+
+    if len(sys.argv) < 2:
+        print("Usage: vllm_manager.py [list|start|stop|logs] ...")
+        sys.exit(1)
+
+    cmd = sys.argv[1]
+
+    if cmd == "list":
+        models = manager.list()
+        if not models:
+            print("No models running")
+        else:
+            # Get external IP
+            try:
+                # Try to get IP from default interface
+                result = sp.run(['hostname', '-I'], capture_output=True, text=True)
+                if result.returncode == 0 and result.stdout.strip():
+                    host_ip = result.stdout.strip().split()[0]
+                else:
+                    host_ip = socket.gethostbyname(socket.gethostname())
+            except:
+                host_ip = socket.gethostbyname(socket.gethostname())
+            print(f"Running models:")
+            for name, info in models.items():
+                print(f"\n{name}:")
+                print(f"  Model: {info['model_id']}")
+                print(f"  HF: https://huggingface.co/{info['model_id']}")
+                print(f"  Port: {info['port']}")
+                if 'tensor_parallel_size' in info and info.get('tensor_parallel_size', 1) > 1:
+                    print(f"  GPUs: {info.get('tensor_parallel_size', 1)} (tensor parallel)")
+                elif 'gpu_id' in info and info['gpu_id'] is not None:
+                    print(f"  GPU: {info['gpu_id']}")
+                print(f"  URL: http://{host_ip}:{info['port']}/v1")
+                if 'log_file' in info:
+                    print(f"  Logs: {info['log_file']}")
+
+    elif cmd == "start":
+        if len(sys.argv) < 3:
+            print("Usage: vllm_manager.py start <model_id> [name] [max_len] [gpu_memory] [tensor_parallel_size]")
+            sys.exit(1)
+
+        model_id = sys.argv[2]
+        name = sys.argv[3] if len(sys.argv) > 3 and sys.argv[3] not in ['""', ''] else None
+        max_len = int(sys.argv[4]) if len(sys.argv) > 4 and sys.argv[4] not in ['""', ''] else None
+        gpu_memory = float(sys.argv[5]) if len(sys.argv) > 5 else None
+        tensor_parallel = int(sys.argv[6]) if len(sys.argv) > 6 else 1
+
+        model_result = manager.start(model_id, name, max_len, gpu_memory, tensor_parallel)
+        # Get external IP
+        try:
+            # Try to get IP from default interface
+            ip_result = sp.run(['hostname', '-I'], capture_output=True, text=True)
+            if ip_result.returncode == 0 and ip_result.stdout.strip():
+                host_ip = ip_result.stdout.strip().split()[0]
+            else:
+                host_ip = socket.gethostbyname(socket.gethostname())
+        except:
+            host_ip = socket.gethostbyname(socket.gethostname())
+
+        print(f"Started {model_result['name']}")
+        print(f"URL: http://{host_ip}:{model_result['port']}/v1")
+        print(f"\nExport for OpenAI clients:")
+        print(f"export OPENAI_BASE_URL='http://{host_ip}:{model_result['port']}/v1'")
+
+    elif cmd == "start_raw":
+        if len(sys.argv) < 5:
+            print("Usage: vllm_manager.py start_raw <model_id> <name> <vllm_args>")
+            sys.exit(1)
+
+        model_id = sys.argv[2]
+        name = sys.argv[3]
+        vllm_args_base64 = sys.argv[4]
+
+        # Decode base64 arguments
+        vllm_args = base64.b64decode(vllm_args_base64).decode('utf-8')
+        print(f"DEBUG: Decoded vllm_args: '{vllm_args}'")
+
+        model_result = manager.start_raw(model_id, name, vllm_args)
+        # Get external IP
+        try:
+            # Try to get IP from default interface
+            ip_result = sp.run(['hostname', '-I'], capture_output=True, text=True)
+            if ip_result.returncode == 0 and ip_result.stdout.strip():
+                host_ip = ip_result.stdout.strip().split()[0]
+            else:
+                host_ip = socket.gethostbyname(socket.gethostname())
+        except:
+            host_ip = socket.gethostbyname(socket.gethostname())
+
+        print(f"Started {model_result['name']}")
+        print(f"URL: http://{host_ip}:{model_result['port']}/v1")
+        print(f"\nExport for OpenAI clients:")
+        print(f"export OPENAI_BASE_URL='http://{host_ip}:{model_result['port']}/v1'")
+
+    elif cmd == "stop":
+        if len(sys.argv) < 3:
+            print("Usage: vllm_manager.py stop <name>")
+            sys.exit(1)
+
+        name = sys.argv[2]
+        if manager.stop(name):
+            print(f"Stopped {name}")
+        else:
+            print(f"Model {name} not found")
+
+    elif cmd == "logs":
+        if len(sys.argv) < 3:
+            print("Usage: vllm_manager.py logs <name> [lines]")
+            sys.exit(1)
+
+        name = sys.argv[2]
+        lines = int(sys.argv[3]) if len(sys.argv) > 3 else 50
+
+        logs = manager.logs(name, lines)
+        if logs is None:
+            print(f"No logs found for {name}")
+        else:
+            print(logs, end='')
+
+    else:
+        print(f"Unknown command: {cmd}")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
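
A brief usage sketch for the manager CLI above (not part of the package; the model id, name, and numeric values are illustrative placeholders), derived from the script's own usage strings. Passing "" for the name or max_len argument leaves it unset while keeping the later positional arguments in place.

    python3 vllm_manager.py start Qwen/Qwen2.5-7B-Instruct qwen "" 0.5 1    # model_id name max_len gpu_memory tensor_parallel_size
    python3 vllm_manager.py list                                            # ports, GPU assignment, URLs, log files
    python3 vllm_manager.py logs qwen 100                                   # last 100 log lines
    python3 vllm_manager.py stop qwen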