@mariozechner/pi 0.2.4 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +392 -294
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +348 -0
- package/dist/cli.js.map +1 -0
- package/dist/commands/models.d.ts +39 -0
- package/dist/commands/models.d.ts.map +1 -0
- package/dist/commands/models.js +612 -0
- package/dist/commands/models.js.map +1 -0
- package/dist/commands/pods.d.ts +21 -0
- package/dist/commands/pods.d.ts.map +1 -0
- package/dist/commands/pods.js +175 -0
- package/dist/commands/pods.js.map +1 -0
- package/dist/commands/prompt.d.ts +7 -0
- package/dist/commands/prompt.d.ts.map +1 -0
- package/dist/commands/prompt.js +55 -0
- package/dist/commands/prompt.js.map +1 -0
- package/dist/config.d.ts +11 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +74 -0
- package/dist/config.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -0
- package/dist/model-configs.d.ts +22 -0
- package/dist/model-configs.d.ts.map +1 -0
- package/dist/model-configs.js +75 -0
- package/dist/model-configs.js.map +1 -0
- package/dist/models.json +305 -0
- package/dist/ssh.d.ts +24 -0
- package/dist/ssh.d.ts.map +1 -0
- package/dist/ssh.js +115 -0
- package/dist/ssh.js.map +1 -0
- package/dist/types.d.ts +23 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/package.json +38 -40
- package/LICENSE +0 -21
- package/pi.js +0 -1379
- package/pod_setup.sh +0 -74
- package/vllm_manager.py +0 -662
package/vllm_manager.py
DELETED
@@ -1,662 +0,0 @@
#!/usr/bin/env python3
"""
Simple vLLM Manager - Run multiple models on different ports
"""

import os
import json
import subprocess as sp
import psutil
import socket
import base64
from pathlib import Path
from typing import Dict, Optional
from datetime import datetime

# Config
CONFIG_FILE = Path.home() / ".vllm_manager.json"
LOGS_DIR = Path.home() / ".vllm_logs"
BASE_PORT = 8001  # Start from 8001, leave 8000 free

class VLLMManager:
    def __init__(self):
        self.models = {}  # name -> {pid, port, model_id, log_file}
        self.load()
        LOGS_DIR.mkdir(exist_ok=True)

    def load(self):
        if CONFIG_FILE.exists():
            with open(CONFIG_FILE) as f:
                self.models = json.load(f)

    def save(self):
        with open(CONFIG_FILE, "w") as f:
            json.dump(self.models, f, indent=2)

    def is_running(self, pid: int) -> bool:
        try:
            process = psutil.Process(pid)
            return process.is_running()
        except:
            return False

    def find_free_port(self) -> int:
        used_ports = {info['port'] for info in self.models.values()}
        for port in range(BASE_PORT, BASE_PORT + 10):
            if port not in used_ports:
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    try:
                        s.bind(('', port))
                        return port
                    except:
                        continue
        raise Exception("No free ports")

    def get_gpu_count(self) -> int:
        try:
            result = sp.run(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'],
                            capture_output=True, text=True)
            if result.returncode == 0:
                return len(result.stdout.strip().split('\n'))
        except:
            pass
        return 1

    def find_available_gpu(self) -> Optional[int]:
        """Find the next available GPU that's not heavily used"""
        gpu_count = self.get_gpu_count()
        if gpu_count == 1:
            return None  # Let vLLM use default

        # Get GPUs used by our models
        used_gpus = set()
        for info in self.models.values():
            if 'gpu_id' in info:
                used_gpus.add(info['gpu_id'])

        # Find first unused GPU
        for gpu_id in range(gpu_count):
            if gpu_id not in used_gpus:
                return gpu_id

        # If all GPUs have at least one model, find the least loaded
        # For now, just cycle through
        return len(self.models) % gpu_count

    def list(self):
        # Clean up dead processes
        to_remove = []
        for name, info in self.models.items():
            if not self.is_running(info['pid']):
                to_remove.append(name)

        for name in to_remove:
            del self.models[name]

        if to_remove:
            self.save()

        return self.models

    def get_tool_parser_for_model(self, model_id: str) -> tuple[str, Optional[str]]:
        """Determine the appropriate tool parser and chat template for a model."""
        model_lower = model_id.lower()

        # Qwen models
        if 'qwen' in model_lower:
            if 'qwen3-coder' in model_lower:
                return "qwen3_coder", None  # Try qwen3_coder if it exists
            elif 'qwen2.5' in model_lower or 'qwq' in model_lower:
                return "hermes", None  # Qwen2.5 uses hermes
            else:
                return "hermes", None  # Default for other Qwen models

        # Mistral models
        elif 'mistral' in model_lower:
            return "mistral", "examples/tool_chat_template_mistral_parallel.jinja"

        # Llama models
        elif 'llama' in model_lower or 'meta-llama' in model_lower:
            if 'llama-4' in model_lower:
                return "llama4_pythonic", "examples/tool_chat_template_llama4_pythonic.jinja"
            elif 'llama-3.2' in model_lower:
                return "llama3_json", "examples/tool_chat_template_llama3.2_json.jinja"
            elif 'llama-3.1' in model_lower:
                return "llama3_json", "examples/tool_chat_template_llama3.1_json.jinja"
            else:
                return "llama3_json", None

        # InternLM models
        elif 'internlm' in model_lower:
            return "internlm", "examples/tool_chat_template_internlm2_tool.jinja"

        # Jamba models
        elif 'jamba' in model_lower:
            return "jamba", None

        # Granite models
        elif 'granite' in model_lower:
            if 'granite-20b-functioncalling' in model_lower:
                return "granite-20b-fc", "examples/tool_chat_template_granite_20b_fc.jinja"
            elif 'granite-3.0' in model_lower:
                return "granite", "examples/tool_chat_template_granite.jinja"
            else:
                return "granite", None

        # DeepSeek models
        elif 'deepseek' in model_lower:
            if 'deepseek-r1' in model_lower:
                return "deepseek_v3", "examples/tool_chat_template_deepseekr1.jinja"
            elif 'deepseek-v3' in model_lower:
                return "deepseek_v3", "examples/tool_chat_template_deepseekv3.jinja"
            else:
                return "hermes", None  # Fallback for other DeepSeek models

        # xLAM models
        elif 'xlam' in model_lower:
            if 'llama-xlam' in model_lower:
                return "xlam", "examples/tool_chat_template_xlam_llama.jinja"
            else:
                return "xlam", "examples/tool_chat_template_xlam_qwen.jinja"

        # Phi models (Microsoft)
        elif 'phi' in model_lower:
            # Phi models don't have tool calling tokens, disable by default
            return None, None

        # Default fallback
        else:
            return "hermes", None

    def start(self, model_id: str, name: Optional[str] = None, max_len: Optional[int] = None, gpu_memory_utilization: float = None, tensor_parallel_size: int = 1, gpu_ids: Optional[str] = None):
        # Generate name
        if not name:
            name = model_id.split('/')[-1].lower().replace('-', '_')

        # Check if already running
        if name in self.models and self.is_running(self.models[name]['pid']):
            return self.models[name]

        # Find port
        port = self.find_free_port()

        # Create log file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_file = LOGS_DIR / f"{name}_{timestamp}.log"

        # Set GPU memory utilization if not specified
        if gpu_memory_utilization is None:
            print("WARNING: No GPU memory utilization specified, defaulting to 90%")
            print("         Consider specifying based on model size to run multiple models")
            print("         Examples: 0.2 for small models, 0.5 for medium, 0.9 for large")
            gpu_memory_utilization = 0.9

        # Get appropriate tool parser for the model
        tool_parser, chat_template = self.get_tool_parser_for_model(model_id)

        # Start vLLM (use venv python if available)
        python_cmd = str(Path.home() / "vllm_env/bin/python3") if (Path.home() / "vllm_env/bin/python3").exists() else "python3"
        cmd = [
            python_cmd, "-u", "-m", "vllm.entrypoints.openai.api_server",
            "--model", model_id,
            "--host", "0.0.0.0",
            "--port", str(port),
            "--gpu-memory-utilization", str(gpu_memory_utilization)
        ]

        # Only add tool calling if a parser is available
        if tool_parser:
            print(f"Auto-detected tool parser: {tool_parser}" + (f" with chat template: {chat_template}" if chat_template else ""))
            cmd.extend([
                "--enable-auto-tool-choice",
                "--tool-call-parser", tool_parser
            ])
            # Add chat template if specified
            if chat_template:
                cmd.extend(["--chat-template", chat_template])
        else:
            print(f"Tool calling disabled for {model_id} (no compatible parser)")

        # Only add max-model-len if specified
        if max_len is not None:
            cmd.extend(["--max-model-len", str(max_len)])

        # Add tensor parallel size if > 1
        if tensor_parallel_size > 1:
            cmd.extend(["--tensor-parallel-size", str(tensor_parallel_size)])

        # Use environment as-is (already configured by .pirc)
        env = os.environ.copy()

        # Handle GPU assignment
        assigned_gpu = None
        if tensor_parallel_size > 1:
            # Multi-GPU: use all GPUs
            gpu_count = self.get_gpu_count()
            if tensor_parallel_size > gpu_count:
                print(f"Warning: Requested {tensor_parallel_size} GPUs but only {gpu_count} available")
                tensor_parallel_size = gpu_count
        else:
            # Single GPU: find available GPU
            if gpu_ids:
                env['CUDA_VISIBLE_DEVICES'] = gpu_ids
                assigned_gpu = int(gpu_ids.split(',')[0])
            else:
                assigned_gpu = self.find_available_gpu()
                if assigned_gpu is not None:
                    env['CUDA_VISIBLE_DEVICES'] = str(assigned_gpu)
                    print(f"Auto-assigned to GPU {assigned_gpu}")

        # Open log file and start process
        with open(log_file, 'w') as f:
            f.write(f"=== Starting {model_id} at {datetime.now()} ===\n")
            f.write(f"Command: {' '.join(cmd)}\n")
            if tool_parser:
                f.write(f"Tool Parser: {tool_parser}\n")
                if chat_template:
                    f.write(f"Chat Template: {chat_template}\n")
            else:
                f.write(f"Tool Calling: Disabled (no compatible parser)\n")
            if gpu_ids:
                f.write(f"CUDA_VISIBLE_DEVICES: {gpu_ids}\n")
            if tensor_parallel_size > 1:
                f.write(f"Tensor Parallel Size: {tensor_parallel_size}\n")
            f.write("=" * 60 + "\n\n")
            f.flush()

            process = sp.Popen(
                cmd,
                stdout=f,
                stderr=sp.STDOUT,  # Merge stderr into stdout
                bufsize=1,  # Line buffered
                universal_newlines=True,
                env=env  # Pass the modified environment
            )

        # Save info
        self.models[name] = {
            "pid": process.pid,
            "port": port,
            "model_id": model_id,
            "log_file": str(log_file),
            "gpu_id": assigned_gpu,
            "tensor_parallel_size": tensor_parallel_size if tensor_parallel_size > 1 else 1
        }
        self.save()

        return {"name": name, "port": port, "pid": process.pid, "log_file": str(log_file)}

    def start_raw(self, model_id: str, name: str, vllm_args: str):
        # Check if already running
        if name in self.models and self.is_running(self.models[name]['pid']):
            return self.models[name]

        # Find port
        port = self.find_free_port()

        # Create log file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_file = LOGS_DIR / f"{name}_{timestamp}.log"

        # Start vLLM with raw arguments
        python_cmd = str(Path.home() / "vllm_env/bin/python3") if (Path.home() / "vllm_env/bin/python3").exists() else "python3"

        # Base command - ensure vllm_args is properly quoted
        cmd = f'{python_cmd} -u -m vllm.entrypoints.openai.api_server --model "{model_id}" --host 0.0.0.0 --port {port} {vllm_args}'

        # Use environment as-is (already configured by .pirc)
        env = os.environ.copy()

        # Open log file and start process
        with open(log_file, 'w') as f:
            f.write(f"=== Starting {model_id} at {datetime.now()} ===")
            f.write(f"\nCommand: {cmd}\n")
            f.write("=" * 60 + "\n\n")
            f.flush()

            # Use shell=True for the command string
            process = sp.Popen(
                cmd,
                shell=True,
                stdout=f,
                stderr=sp.STDOUT,  # Merge stderr into stdout
                bufsize=1,  # Line buffered
                universal_newlines=True,
                env=env  # Pass the modified environment
            )

        # Save info
        self.models[name] = {
            "pid": process.pid,
            "port": port,
            "model_id": model_id,
            "log_file": str(log_file),
            "raw_args": vllm_args
        }
        self.save()

        return {"name": name, "port": port, "pid": process.pid, "log_file": str(log_file)}

    def stop(self, name: str):
        if name not in self.models:
            return False

        info = self.models[name]
        try:
            process = psutil.Process(info['pid'])
            process.terminate()
            process.wait(timeout=5)
        except:
            pass

        # Force kill all vLLM-related Python processes to ensure cleanup
        max_attempts = 5
        for attempt in range(max_attempts):
            try:
                # Get all python processes containing 'vllm'
                ps_result = sp.run(['ps', 'aux'], capture_output=True, text=True)
                vllm_pids = []

                for line in ps_result.stdout.split('\n'):
                    if 'python' in line and 'vllm' in line and 'vllm_manager.py' not in line:
                        # Extract PID (second column)
                        parts = line.split()
                        if len(parts) > 1:
                            vllm_pids.append(parts[1])

                if not vllm_pids:
                    break  # No vLLM processes found

                # Kill the vLLM processes
                for pid in vllm_pids:
                    try:
                        sp.run(['kill', '-9', pid], capture_output=True)
                    except:
                        pass

                # Small delay between attempts
                import time
                time.sleep(0.5)
            except:
                break

        del self.models[name]
        self.save()
        return True

    def logs(self, name: str, lines: int = 50):
        if name not in self.models:
            return None

        log_file = self.models[name].get('log_file')
        if not log_file or not Path(log_file).exists():
            return None

        # Read last N lines
        with open(log_file, 'r') as f:
            all_lines = f.readlines()
            return ''.join(all_lines[-lines:])

    def check_downloads(self):
        """Check model download progress in HuggingFace cache"""
        import glob
        import re

        # Respect HuggingFace environment variables
        if os.environ.get('HUGGINGFACE_HUB_CACHE'):
            cache_dir = Path(os.environ['HUGGINGFACE_HUB_CACHE'])
        elif os.environ.get('HF_HOME'):
            cache_dir = Path(os.environ['HF_HOME']) / "hub"
        else:
            cache_dir = Path.home() / ".cache" / "huggingface" / "hub"

        if not cache_dir.exists():
            return {"status": "NO_CACHE", "cache_dir": str(cache_dir)}

        model_dirs = list(cache_dir.glob("models--*"))
        if not model_dirs:
            return {"status": "NO_MODELS"}

        results = []

        for model_dir in model_dirs:
            # Extract model name
            model_name = model_dir.name.replace("models--", "").replace("--", "/")

            # Get size (only count actual blob files, not symlinks)
            total_size = 0
            blobs_dir = model_dir / "blobs"
            if blobs_dir.exists():
                for f in blobs_dir.iterdir():
                    if f.is_file() and not f.name.endswith('.incomplete'):
                        total_size += f.stat().st_size
            size_gb = total_size / (1024**3)

            # Count safetensors files in blobs directory (actual files)
            safetensors_count = 0
            snapshots_dir = model_dir / "snapshots"
            if snapshots_dir.exists():
                for snapshot in snapshots_dir.iterdir():
                    if snapshot.is_dir():
                        safetensors_count = len(list(snapshot.glob("*.safetensors")))
                        break  # Use first snapshot
            file_count = safetensors_count

            # Get total expected files from filename pattern
            total_files = 0
            if snapshots_dir.exists():
                for snapshot in snapshots_dir.iterdir():
                    if snapshot.is_dir():
                        for f in snapshot.glob("*.safetensors"):
                            match = re.search(r'model-\d+-of-(\d+)\.safetensors', f.name)
                            if match:
                                total_files = max(total_files, int(match.group(1)))
                        break  # Only check first snapshot

            # Check if actively downloading (check if any incomplete files exist in blobs)
            incomplete_files = []
            if blobs_dir.exists():
                incomplete_files = list(blobs_dir.glob("*.incomplete"))
            is_active = len(incomplete_files) > 0

            results.append({
                "model": model_name,
                "size_gb": round(size_gb, 1),
                "files": file_count,
                "total_files": total_files,
                "active": is_active
            })

        # Count vLLM processes
        vllm_count = 0
        for proc in psutil.process_iter(['pid', 'cmdline']):
            try:
                cmdline = ' '.join(proc.info['cmdline'] or [])
                if 'python' in cmdline and 'vllm' in cmdline and 'vllm_manager.py' not in cmdline:
                    vllm_count += 1
            except:
                pass

        return {
            "status": "OK",
            "models": results,
            "vllm_processes": vllm_count
        }

def main():
    import sys

    manager = VLLMManager()

    if len(sys.argv) < 2:
        print("Usage: vllm_manager.py [list|start|stop|logs|downloads] ...")
        sys.exit(1)

    cmd = sys.argv[1]

    if cmd == "list":
        models = manager.list()
        if not models:
            print("No models running")
        else:
            # Get external IP
            try:
                # Try to get IP from default interface
                result = sp.run(['hostname', '-I'], capture_output=True, text=True)
                if result.returncode == 0 and result.stdout.strip():
                    host_ip = result.stdout.strip().split()[0]
                else:
                    host_ip = socket.gethostbyname(socket.gethostname())
            except:
                host_ip = socket.gethostbyname(socket.gethostname())
            print(f"Running models:")
            for name, info in models.items():
                print(f"\n{name}:")
                print(f"  Model: {info['model_id']}")
                print(f"  HF: https://huggingface.co/{info['model_id']}")
                print(f"  Port: {info['port']}")
                if 'tensor_parallel_size' in info and info.get('tensor_parallel_size', 1) > 1:
                    print(f"  GPUs: {info.get('tensor_parallel_size', 1)} (tensor parallel)")
                elif 'gpu_id' in info and info['gpu_id'] is not None:
                    print(f"  GPU: {info['gpu_id']}")
                print(f"  URL: http://{host_ip}:{info['port']}/v1")
                print(f"\n  Export for OpenAI clients:")
                print(f"  export OPENAI_BASE_URL='http://{host_ip}:{info['port']}/v1'")
                print(f"  export OPENAI_API_KEY='dummy'")
                print(f"  export OPENAI_MODEL='{info['model_id']}'")
                if 'log_file' in info:
                    print(f"\n  Logs: {info['log_file']}")

    elif cmd == "start":
        if len(sys.argv) < 3:
            print("Usage: vllm_manager.py start <model_id> [name] [max_len] [gpu_memory] [tensor_parallel_size]")
            sys.exit(1)

        model_id = sys.argv[2]
        name = sys.argv[3] if len(sys.argv) > 3 and sys.argv[3] not in ['""', ''] else None
        max_len = int(sys.argv[4]) if len(sys.argv) > 4 and sys.argv[4] not in ['""', ''] else None
        gpu_memory = float(sys.argv[5]) if len(sys.argv) > 5 else None
        tensor_parallel = int(sys.argv[6]) if len(sys.argv) > 6 else 1

        model_result = manager.start(model_id, name, max_len, gpu_memory, tensor_parallel)
        # Get external IP
        try:
            # Try to get IP from default interface
            ip_result = sp.run(['hostname', '-I'], capture_output=True, text=True)
            if ip_result.returncode == 0 and ip_result.stdout.strip():
                host_ip = ip_result.stdout.strip().split()[0]
            else:
                host_ip = socket.gethostbyname(socket.gethostname())
        except:
            host_ip = socket.gethostbyname(socket.gethostname())

        print(f"Started {model_result['name']}")
        print(f"URL: http://{host_ip}:{model_result['port']}/v1")
        print(f"\nExport for OpenAI clients:")
        print(f"export OPENAI_BASE_URL='http://{host_ip}:{model_result['port']}/v1'")
        print(f"export OPENAI_MODEL='{model_id}'")

    elif cmd == "start_raw":
        if len(sys.argv) < 5:
            print("Usage: vllm_manager.py start_raw <model_id> <name> <vllm_args>")
            sys.exit(1)

        model_id = sys.argv[2]
        name = sys.argv[3]
        vllm_args_base64 = sys.argv[4]

        # Decode base64 arguments
        vllm_args = base64.b64decode(vllm_args_base64).decode('utf-8')
        print(f"DEBUG: Decoded vllm_args: '{vllm_args}'")

        model_result = manager.start_raw(model_id, name, vllm_args)
        # Get external IP
        try:
            # Try to get IP from default interface
            ip_result = sp.run(['hostname', '-I'], capture_output=True, text=True)
            if ip_result.returncode == 0 and ip_result.stdout.strip():
                host_ip = ip_result.stdout.strip().split()[0]
            else:
                host_ip = socket.gethostbyname(socket.gethostname())
        except:
            host_ip = socket.gethostbyname(socket.gethostname())

        print(f"Started {model_result['name']}")
        print(f"URL: http://{host_ip}:{model_result['port']}/v1")
        print(f"\nExport for OpenAI clients:")
        print(f"export OPENAI_BASE_URL='http://{host_ip}:{model_result['port']}/v1'")
        print(f"export OPENAI_MODEL='{model_id}'")

    elif cmd == "stop":
        if len(sys.argv) < 3:
            print("Usage: vllm_manager.py stop <name>")
            sys.exit(1)

        name = sys.argv[2]
        if manager.stop(name):
            print(f"Stopped {name}")
        else:
            print(f"Model {name} not found")

    elif cmd == "logs":
        if len(sys.argv) < 3:
            print("Usage: vllm_manager.py logs <name> [lines]")
            sys.exit(1)

        name = sys.argv[2]
        lines = int(sys.argv[3]) if len(sys.argv) > 3 else 50

        logs = manager.logs(name, lines)
        if logs is None:
            print(f"No logs found for {name}")
        else:
            print(logs, end='')

    elif cmd == "downloads":
        # Check if --stream flag is provided
        stream = len(sys.argv) > 2 and sys.argv[2] == "--stream"

        if stream:
            # Streaming mode - continuously output status
            import time
            import signal

            # Handle SIGTERM/SIGINT for clean shutdown
            def signal_handler(sig, frame):
                sys.exit(0)

            signal.signal(signal.SIGINT, signal_handler)
            signal.signal(signal.SIGTERM, signal_handler)

            while True:
                download_info = manager.check_downloads()

                if download_info["status"] == "NO_CACHE":
                    print(json.dumps({"status": "NO_CACHE", "message": "No HuggingFace cache found"}))
                elif download_info["status"] == "NO_MODELS":
                    print(json.dumps({"status": "NO_MODELS", "message": "No models in cache"}))
                else:
                    print(json.dumps(download_info))

                sys.stdout.flush()  # Force flush to ensure output is sent
                time.sleep(2)  # Update every 2 seconds
        else:
            # Single check mode
            download_info = manager.check_downloads()

            if download_info["status"] == "NO_CACHE":
                print("No HuggingFace cache found")
            elif download_info["status"] == "NO_MODELS":
                print("No models in cache")
            else:
                # Output as JSON for easy parsing
                print(json.dumps(download_info))

    else:
        print(f"Unknown command: {cmd}")
        sys.exit(1)

if __name__ == "__main__":
    main()