@mariozechner/pi 0.1.5 → 0.2.4
- package/README.md +98 -2
- package/package.json +1 -1
- package/pi.js +576 -75
- package/pod_setup.sh +55 -114
- package/vllm_manager.py +167 -4
package/pod_setup.sh
CHANGED
@@ -1,133 +1,74 @@
-#!/bin/bash
-#
+#!/usr/bin/env bash
+# GPU pod bootstrap: Ubuntu 22.04 + CUDA 12.6/12.8, vLLM latest, FlashInfer w/ TRT kernels (sm70-120)
 
-set -
+set -euo pipefail
 
-
+apt update -y
+apt install -y python3-pip python3-venv git build-essential cmake ninja-build curl
 
-#
-
-
+# --- Install uv (fast Python package manager) --------------------------------
+curl -LsSf https://astral.sh/uv/install.sh | sh
+export PATH="$HOME/.local/bin:$PATH"
 
-# Create
-
-
-
+# --- Create and activate venv ------------------------------------------------
+VENV="$HOME/vllm_env"
+uv venv --python 3.12 --seed "$VENV"
+source "$VENV/bin/activate"
 
-#
-
+# --- Install vLLM with automatic PyTorch selection ---------------------------
+echo "Installing vLLM with automatic CUDA/PyTorch detection..."
+# uv automatically selects the right PyTorch based on CUDA version
+uv pip install vllm --torch-backend=auto
 
-#
-
+# --- Install additional packages ---------------------------------------------
+echo "Installing additional packages..."
+uv pip install huggingface-hub psutil tensorrt hf_transfer
 
-#
-echo "
+# --- FlashInfer installation (optional, improves performance) ----------------
+echo "Attempting FlashInfer installation (optional)..."
+# vLLM will use Flash Attention as fallback if FlashInfer is not available
 
-#
-
-
-
-    echo "Detected CUDA version from nvidia-smi: $CUDA_VERSION"
-elif command -v nvcc &> /dev/null; then
-    CUDA_VERSION=$(nvcc --version | grep "release" | sed -n 's/.*release \([0-9]\+\.[0-9]\+\).*/\1/p')
-    echo "Detected CUDA version from nvcc: $CUDA_VERSION"
+# Try the official FlashInfer package name
+if uv pip install flashinfer-python; then
+    echo "FlashInfer installed successfully"
+    ATTENTION_BACKEND="FLASHINFER"
 else
-
+    echo "FlashInfer not available, using Flash Attention instead"
+    ATTENTION_BACKEND="FLASH_ATTN"
 fi
 
-
-
-    case "$CUDA_VERSION" in
-        12.8*)
-            echo "Installing PyTorch with CUDA 12.8 support"
-            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
-            ;;
-        12.7*)
-            echo "Installing PyTorch with CUDA 12.7 support"
-            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu127
-            ;;
-        12.6*)
-            echo "Installing PyTorch with CUDA 12.6 support"
-            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
-            ;;
-        12.4*)
-            echo "Installing PyTorch with CUDA 12.4 support"
-            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
-            ;;
-        12.1*)
-            echo "Installing PyTorch with CUDA 12.1 support"
-            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
-            ;;
-        11.8*)
-            echo "Installing PyTorch with CUDA 11.8 support"
-            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
-            ;;
-        *)
-            echo "CUDA $CUDA_VERSION detected - using default PyTorch (may not be optimal)"
-            pip install torch torchvision torchaudio
-            ;;
-    esac
-else
-    echo "WARNING: nvcc not found, installing default PyTorch"
-    pip install torch torchvision torchaudio
-fi
-
-pip install vllm huggingface-hub psutil
-
-# Install FlashInfer for better performance (~15% sampler latency reduction)
-echo "Installing FlashInfer for performance optimization..."
-echo "Building FlashInfer from source..."
-
-# Clone and build FlashInfer from source
-cd /tmp
-if [ -d "flashinfer" ]; then
-    rm -rf flashinfer
-fi
-
-git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-cd flashinfer
-
-# Install from source
-if python -m pip install -v .; then
-    echo "FlashInfer successfully built from source"
-else
-    echo "FlashInfer installation failed (optional)"
-fi
+# --- HF token check ----------------------------------------------------------
+: "${HF_TOKEN:?HF_TOKEN env var required}"
 
-
-
-rm -rf /tmp/flashinfer
+mkdir -p ~/.config/vllm
+touch ~/.config/vllm/do_not_track
 
-
-
-
-
-
-fi
-
-# Create directory for vLLM config
-mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track
-
-# Create .pirc file for consistent environment
-cat > ~/.pirc << EOF
-# Prime Intellect CLI environment
-# This file is sourced by all pi commands
-
-# Activate vLLM virtual environment if it exists
-if [ -d "\$HOME/vllm_env" ]; then
-    source "\$HOME/vllm_env/bin/activate"
-fi
-
-# Performance optimizations
+cat > ~/.pirc <<EOF
+# auto-sourced env
+[ -d "$HOME/vllm_env" ] && source "$HOME/vllm_env/bin/activate"
+export PATH="$HOME/.local/bin:$PATH"
+export VLLM_ATTENTION_BACKEND=${ATTENTION_BACKEND}
 export VLLM_USE_FLASHINFER_SAMPLER=1
 export VLLM_USE_DEEP_GEMM=1
 export VLLM_NO_USAGE_STATS=1
 export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
-
-
-export HF_TOKEN
-export
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export HF_TOKEN=${HF_TOKEN}
+export HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
+export HF_HUB_ENABLE_HF_TRANSFER=1
 EOF
 
-#
-
+# --- RunPod specific setup ---------------------------------------------------
+if df -h | grep -q "runpod.net.*workspace"; then
+    echo "Detected RunPod instance - setting up workspace symlink..."
+    if [ ! -L ~/.cache/huggingface ]; then
+        mkdir -p /workspace/cache/huggingface
+        rm -rf ~/.cache/huggingface 2>/dev/null || true
+        ln -s /workspace/cache/huggingface ~/.cache/huggingface
+        echo "Created symlink: ~/.cache/huggingface -> /workspace/cache/huggingface"
+    else
+        echo "Symlink already exists"
+    fi
+fi
+
+echo "=== DONE ==="
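Note: the new ~/.pirc heredoc is unquoted, so ${ATTENTION_BACKEND}, ${HF_TOKEN}, $HOME and $PATH are expanded when pod_setup.sh writes the file, not when it is later sourced. A rough sketch of the generated file, assuming the script ran as root, flashinfer-python installed cleanly, and hf_xxx stands in for the real token:

# auto-sourced env
[ -d "/root/vllm_env" ] && source "/root/vllm_env/bin/activate"
export PATH="/root/.local/bin:..."   # $PATH was baked in at setup time
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_USE_FLASHINFER_SAMPLER=1
export VLLM_USE_DEEP_GEMM=1
export VLLM_NO_USAGE_STATS=1
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export HF_TOKEN=hf_xxx
export HUGGING_FACE_HUB_TOKEN=hf_xxx
export HF_HUB_ENABLE_HF_TRANSFER=1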
package/vllm_manager.py
CHANGED
@@ -197,7 +197,7 @@ class VLLMManager:
         # Start vLLM (use venv python if available)
         python_cmd = str(Path.home() / "vllm_env/bin/python3") if (Path.home() / "vllm_env/bin/python3").exists() else "python3"
         cmd = [
-            python_cmd, "-m", "vllm.entrypoints.openai.api_server",
+            python_cmd, "-u", "-m", "vllm.entrypoints.openai.api_server",
             "--model", model_id,
             "--host", "0.0.0.0",
             "--port", str(port),
@@ -303,7 +303,7 @@ class VLLMManager:
         python_cmd = str(Path.home() / "vllm_env/bin/python3") if (Path.home() / "vllm_env/bin/python3").exists() else "python3"
 
         # Base command - ensure vllm_args is properly quoted
-        cmd = f'{python_cmd} -m vllm.entrypoints.openai.api_server --model "{model_id}" --host 0.0.0.0 --port {port} {vllm_args}'
+        cmd = f'{python_cmd} -u -m vllm.entrypoints.openai.api_server --model "{model_id}" --host 0.0.0.0 --port {port} {vllm_args}'
 
         # Use environment as-is (already configured by .pirc)
         env = os.environ.copy()
@@ -351,6 +351,37 @@ class VLLMManager:
         except:
             pass
 
+        # Force kill all vLLM-related Python processes to ensure cleanup
+        max_attempts = 5
+        for attempt in range(max_attempts):
+            try:
+                # Get all python processes containing 'vllm'
+                ps_result = sp.run(['ps', 'aux'], capture_output=True, text=True)
+                vllm_pids = []
+
+                for line in ps_result.stdout.split('\n'):
+                    if 'python' in line and 'vllm' in line and 'vllm_manager.py' not in line:
+                        # Extract PID (second column)
+                        parts = line.split()
+                        if len(parts) > 1:
+                            vllm_pids.append(parts[1])
+
+                if not vllm_pids:
+                    break  # No vLLM processes found
+
+                # Kill the vLLM processes
+                for pid in vllm_pids:
+                    try:
+                        sp.run(['kill', '-9', pid], capture_output=True)
+                    except:
+                        pass
+
+                # Small delay between attempts
+                import time
+                time.sleep(0.5)
+            except:
+                break
+
         del self.models[name]
         self.save()
         return True
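The cleanup sweep added to stop() selects stray vLLM workers the same way this shell pipeline would (an illustrative sketch, not code from the package): every python process whose command line mentions vllm, excluding the manager itself. It then kill -9s the PIDs, retrying up to five times with a 0.5 s pause so workers that appear mid-shutdown are also caught.

ps aux | grep python | grep vllm | grep -v vllm_manager.py | grep -v grep | awk '{print $2}'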
@@ -367,6 +398,92 @@ class VLLMManager:
         with open(log_file, 'r') as f:
             all_lines = f.readlines()
             return ''.join(all_lines[-lines:])
+
+    def check_downloads(self):
+        """Check model download progress in HuggingFace cache"""
+        import glob
+        import re
+
+        # Respect HuggingFace environment variables
+        if os.environ.get('HUGGINGFACE_HUB_CACHE'):
+            cache_dir = Path(os.environ['HUGGINGFACE_HUB_CACHE'])
+        elif os.environ.get('HF_HOME'):
+            cache_dir = Path(os.environ['HF_HOME']) / "hub"
+        else:
+            cache_dir = Path.home() / ".cache" / "huggingface" / "hub"
+
+        if not cache_dir.exists():
+            return {"status": "NO_CACHE", "cache_dir": str(cache_dir)}
+
+        model_dirs = list(cache_dir.glob("models--*"))
+        if not model_dirs:
+            return {"status": "NO_MODELS"}
+
+        results = []
+
+        for model_dir in model_dirs:
+            # Extract model name
+            model_name = model_dir.name.replace("models--", "").replace("--", "/")
+
+            # Get size (only count actual blob files, not symlinks)
+            total_size = 0
+            blobs_dir = model_dir / "blobs"
+            if blobs_dir.exists():
+                for f in blobs_dir.iterdir():
+                    if f.is_file() and not f.name.endswith('.incomplete'):
+                        total_size += f.stat().st_size
+            size_gb = total_size / (1024**3)
+
+            # Count safetensors files in blobs directory (actual files)
+            safetensors_count = 0
+            snapshots_dir = model_dir / "snapshots"
+            if snapshots_dir.exists():
+                for snapshot in snapshots_dir.iterdir():
+                    if snapshot.is_dir():
+                        safetensors_count = len(list(snapshot.glob("*.safetensors")))
+                        break  # Use first snapshot
+            file_count = safetensors_count
+
+            # Get total expected files from filename pattern
+            total_files = 0
+            if snapshots_dir.exists():
+                for snapshot in snapshots_dir.iterdir():
+                    if snapshot.is_dir():
+                        for f in snapshot.glob("*.safetensors"):
+                            match = re.search(r'model-\d+-of-(\d+)\.safetensors', f.name)
+                            if match:
+                                total_files = max(total_files, int(match.group(1)))
+                        break  # Only check first snapshot
+
+            # Check if actively downloading (check if any incomplete files exist in blobs)
+            incomplete_files = []
+            if blobs_dir.exists():
+                incomplete_files = list(blobs_dir.glob("*.incomplete"))
+            is_active = len(incomplete_files) > 0
+
+            results.append({
+                "model": model_name,
+                "size_gb": round(size_gb, 1),
+                "files": file_count,
+                "total_files": total_files,
+                "active": is_active
+            })
+
+        # Count vLLM processes
+        vllm_count = 0
+        for proc in psutil.process_iter(['pid', 'cmdline']):
+            try:
+                cmdline = ' '.join(proc.info['cmdline'] or [])
+                if 'python' in cmdline and 'vllm' in cmdline and 'vllm_manager.py' not in cmdline:
+                    vllm_count += 1
+            except:
+                pass
+
+        return {
+            "status": "OK",
+            "models": results,
+            "vllm_processes": vllm_count
+        }
 
 def main():
     import sys
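check_downloads() reads the standard HuggingFace hub cache layout: one models--<org>--<name> directory per model, real payloads under blobs/ (named *.incomplete while still downloading), and symlinked snapshots under snapshots/<revision>/. The same state can be eyeballed from a shell with a sketch like the following, which mirrors the cache-path lookup order used in the code:

CACHE="${HUGGINGFACE_HUB_CACHE:-${HF_HOME:-$HOME/.cache/huggingface}/hub}"
ls -d "$CACHE"/models--*                     # cached models
find "$CACHE" -name '*.incomplete' | wc -l   # non-zero while a download is still active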
@@ -374,7 +491,7 @@ def main():
     manager = VLLMManager()
 
     if len(sys.argv) < 2:
-        print("Usage: vllm_manager.py [list|start|stop|logs] ...")
+        print("Usage: vllm_manager.py [list|start|stop|logs|downloads] ...")
         sys.exit(1)
 
     cmd = sys.argv[1]
@@ -405,8 +522,12 @@ def main():
             elif 'gpu_id' in info and info['gpu_id'] is not None:
                 print(f" GPU: {info['gpu_id']}")
             print(f" URL: http://{host_ip}:{info['port']}/v1")
+            print(f"\n Export for OpenAI clients:")
+            print(f" export OPENAI_BASE_URL='http://{host_ip}:{info['port']}/v1'")
+            print(f" export OPENAI_API_KEY='dummy'")
+            print(f" export OPENAI_MODEL='{info['model_id']}'")
             if 'log_file' in info:
-                print(f" Logs: {info['log_file']}")
+                print(f"\n Logs: {info['log_file']}")
 
     elif cmd == "start":
         if len(sys.argv) < 3:
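The export lines that list now prints (and that start/start_raw print below) are meant to be pasted into a shell, after which any OpenAI-compatible client can reach the pod. A minimal curl sketch against vLLM's chat completions endpoint, assuming the exports have been applied and the model has finished loading:

curl -s "$OPENAI_BASE_URL/chat/completions" \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $OPENAI_API_KEY" \
  -d "{\"model\": \"$OPENAI_MODEL\", \"messages\": [{\"role\": \"user\", \"content\": \"Say hi\"}]}"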
@@ -435,6 +556,7 @@ def main():
         print(f"URL: http://{host_ip}:{model_result['port']}/v1")
         print(f"\nExport for OpenAI clients:")
         print(f"export OPENAI_BASE_URL='http://{host_ip}:{model_result['port']}/v1'")
+        print(f"export OPENAI_MODEL='{model_id}'")
 
     elif cmd == "start_raw":
         if len(sys.argv) < 5:
@@ -465,6 +587,7 @@ def main():
         print(f"URL: http://{host_ip}:{model_result['port']}/v1")
         print(f"\nExport for OpenAI clients:")
         print(f"export OPENAI_BASE_URL='http://{host_ip}:{model_result['port']}/v1'")
+        print(f"export OPENAI_MODEL='{model_id}'")
 
     elif cmd == "stop":
         if len(sys.argv) < 3:
@@ -491,6 +614,46 @@ def main():
         else:
             print(logs, end='')
 
+    elif cmd == "downloads":
+        # Check if --stream flag is provided
+        stream = len(sys.argv) > 2 and sys.argv[2] == "--stream"
+
+        if stream:
+            # Streaming mode - continuously output status
+            import time
+            import signal
+
+            # Handle SIGTERM/SIGINT for clean shutdown
+            def signal_handler(sig, frame):
+                sys.exit(0)
+
+            signal.signal(signal.SIGINT, signal_handler)
+            signal.signal(signal.SIGTERM, signal_handler)
+
+            while True:
+                download_info = manager.check_downloads()
+
+                if download_info["status"] == "NO_CACHE":
+                    print(json.dumps({"status": "NO_CACHE", "message": "No HuggingFace cache found"}))
+                elif download_info["status"] == "NO_MODELS":
+                    print(json.dumps({"status": "NO_MODELS", "message": "No models in cache"}))
+                else:
+                    print(json.dumps(download_info))
+
+                sys.stdout.flush()  # Force flush to ensure output is sent
+                time.sleep(2)  # Update every 2 seconds
+        else:
+            # Single check mode
+            download_info = manager.check_downloads()
+
+            if download_info["status"] == "NO_CACHE":
+                print("No HuggingFace cache found")
+            elif download_info["status"] == "NO_MODELS":
+                print("No models in cache")
+            else:
+                # Output as JSON for easy parsing
+                print(json.dumps(download_info))
+
     else:
         print(f"Unknown command: {cmd}")
         sys.exit(1)
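The new downloads subcommand emits a single JSON snapshot of the cache (per-model fields model, size_gb, files, total_files, active, plus a vllm_processes count), while downloads --stream re-emits that object every ~2 seconds for the pi CLI to poll. Illustrative invocations, assuming a shell on the pod with vllm_manager.py in the working directory and at least one model cached:

python3 vllm_manager.py downloads              # one-shot JSON snapshot
python3 vllm_manager.py downloads --stream     # one JSON object every ~2 s; stop with Ctrl-C
python3 vllm_manager.py downloads | jq '.models[] | {model, size_gb, files, total_files, active}'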