@mariozechner/pi 0.1.5 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/pod_setup.sh CHANGED
@@ -1,133 +1,74 @@
- #!/bin/bash
- # Pod setup script for GPU instances. Assumes Ubuntu based system with CUDA drivers installed.
+ #!/usr/bin/env bash
+ # GPU pod bootstrap: Ubuntu 22.04 + CUDA 12.6/12.8, vLLM latest, FlashInfer w/ TRT kernels (sm70-120)

- set -e
+ set -euo pipefail

- echo "=== Pod Setup ==="
+ apt update -y
+ apt install -y python3-pip python3-venv git build-essential cmake ninja-build curl

- # Update and install basics
- sudo apt update
- sudo apt install -y python3-pip python3-venv
+ # --- Install uv (fast Python package manager) --------------------------------
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+ export PATH="$HOME/.local/bin:$PATH"

- # Create virtual environment for vLLM
- VENV_PATH="$HOME/vllm_env"
- echo "Creating virtual environment at $VENV_PATH..."
- python3 -m venv "$VENV_PATH"
+ # --- Create and activate venv ------------------------------------------------
+ VENV="$HOME/vllm_env"
+ uv venv --python 3.12 --seed "$VENV"
+ source "$VENV/bin/activate"

- # Activate virtual environment
- source "$VENV_PATH/bin/activate"
+ # --- Install vLLM with automatic PyTorch selection ---------------------------
+ echo "Installing vLLM with automatic CUDA/PyTorch detection..."
+ # uv automatically selects the right PyTorch based on CUDA version
+ uv pip install vllm --torch-backend=auto

- # Upgrade pip in virtual environment
- pip install --upgrade pip
+ # --- Install additional packages ---------------------------------------------
+ echo "Installing additional packages..."
+ uv pip install huggingface-hub psutil tensorrt hf_transfer

- # Install vLLM and dependencies
- echo "Installing vLLM and dependencies..."
+ # --- FlashInfer installation (optional, improves performance) ----------------
+ echo "Attempting FlashInfer installation (optional)..."
+ # vLLM will use Flash Attention as fallback if FlashInfer is not available

- # Detect CUDA version and install appropriate PyTorch
- # First try nvidia-smi (more commonly available), then nvcc
- if command -v nvidia-smi &> /dev/null; then
-     CUDA_VERSION=$(nvidia-smi | grep -oP 'CUDA Version: \K[0-9]+\.[0-9]+' | head -1)
-     echo "Detected CUDA version from nvidia-smi: $CUDA_VERSION"
- elif command -v nvcc &> /dev/null; then
-     CUDA_VERSION=$(nvcc --version | grep "release" | sed -n 's/.*release \([0-9]\+\.[0-9]\+\).*/\1/p')
-     echo "Detected CUDA version from nvcc: $CUDA_VERSION"
+ # Try the official FlashInfer package name
+ if uv pip install flashinfer-python; then
+     echo "FlashInfer installed successfully"
+     ATTENTION_BACKEND="FLASHINFER"
  else
-     CUDA_VERSION=""
+     echo "FlashInfer not available, using Flash Attention instead"
+     ATTENTION_BACKEND="FLASH_ATTN"
  fi

- if [ -n "$CUDA_VERSION" ]; then
-     # Map CUDA version to PyTorch index
-     case "$CUDA_VERSION" in
-         12.8*)
-             echo "Installing PyTorch with CUDA 12.8 support"
-             pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
-             ;;
-         12.7*)
-             echo "Installing PyTorch with CUDA 12.7 support"
-             pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu127
-             ;;
-         12.6*)
-             echo "Installing PyTorch with CUDA 12.6 support"
-             pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
-             ;;
-         12.4*)
-             echo "Installing PyTorch with CUDA 12.4 support"
-             pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
-             ;;
-         12.1*)
-             echo "Installing PyTorch with CUDA 12.1 support"
-             pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
-             ;;
-         11.8*)
-             echo "Installing PyTorch with CUDA 11.8 support"
-             pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
-             ;;
-         *)
-             echo "CUDA $CUDA_VERSION detected - using default PyTorch (may not be optimal)"
-             pip install torch torchvision torchaudio
-             ;;
-     esac
- else
-     echo "WARNING: nvcc not found, installing default PyTorch"
-     pip install torch torchvision torchaudio
- fi
-
- pip install vllm huggingface-hub psutil
-
- # Install FlashInfer for better performance (~15% sampler latency reduction)
- echo "Installing FlashInfer for performance optimization..."
- echo "Building FlashInfer from source..."
-
- # Clone and build FlashInfer from source
- cd /tmp
- if [ -d "flashinfer" ]; then
-     rm -rf flashinfer
- fi
-
- git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
- cd flashinfer
-
- # Install from source
- if python -m pip install -v .; then
-     echo "FlashInfer successfully built from source"
- else
-     echo "FlashInfer installation failed (optional)"
- fi
+ # --- HF token check ----------------------------------------------------------
+ : "${HF_TOKEN:?HF_TOKEN env var required}"

- # Clean up
- cd /
- rm -rf /tmp/flashinfer
+ mkdir -p ~/.config/vllm
+ touch ~/.config/vllm/do_not_track

- # Setup HuggingFace token from environment
- if [ -z "$HF_TOKEN" ]; then
-     echo "ERROR: HF_TOKEN environment variable not set"
-     echo "Please export HF_TOKEN before running setup"
-     exit 1
- fi
-
- # Create directory for vLLM config
- mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track
-
- # Create .pirc file for consistent environment
- cat > ~/.pirc << EOF
- # Prime Intellect CLI environment
- # This file is sourced by all pi commands
-
- # Activate vLLM virtual environment if it exists
- if [ -d "\$HOME/vllm_env" ]; then
-     source "\$HOME/vllm_env/bin/activate"
- fi
-
- # Performance optimizations
+ cat > ~/.pirc <<EOF
+ # auto-sourced env
+ [ -d "$HOME/vllm_env" ] && source "$HOME/vllm_env/bin/activate"
+ export PATH="$HOME/.local/bin:$PATH"
+ export VLLM_ATTENTION_BACKEND=${ATTENTION_BACKEND}
  export VLLM_USE_FLASHINFER_SAMPLER=1
  export VLLM_USE_DEEP_GEMM=1
  export VLLM_NO_USAGE_STATS=1
  export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
-
- # HuggingFace tokens
- export HF_TOKEN="$HF_TOKEN"
- export HUGGING_FACE_HUB_TOKEN="$HF_TOKEN"
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+ export HF_TOKEN=${HF_TOKEN}
+ export HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
+ export HF_HUB_ENABLE_HF_TRANSFER=1
  EOF

- # Copy manager script
- echo "Setup complete!"
+ # --- RunPod specific setup ---------------------------------------------------
+ if df -h | grep -q "runpod.net.*workspace"; then
+     echo "Detected RunPod instance - setting up workspace symlink..."
+     if [ ! -L ~/.cache/huggingface ]; then
+         mkdir -p /workspace/cache/huggingface
+         rm -rf ~/.cache/huggingface 2>/dev/null || true
+         ln -s /workspace/cache/huggingface ~/.cache/huggingface
+         echo "Created symlink: ~/.cache/huggingface -> /workspace/cache/huggingface"
+     else
+         echo "Symlink already exists"
+     fi
+ fi
+
+ echo "=== DONE ==="
package/vllm_manager.py CHANGED
@@ -197,7 +197,7 @@ class VLLMManager:
          # Start vLLM (use venv python if available)
          python_cmd = str(Path.home() / "vllm_env/bin/python3") if (Path.home() / "vllm_env/bin/python3").exists() else "python3"
          cmd = [
-             python_cmd, "-m", "vllm.entrypoints.openai.api_server",
+             python_cmd, "-u", "-m", "vllm.entrypoints.openai.api_server",
              "--model", model_id,
              "--host", "0.0.0.0",
              "--port", str(port),
@@ -303,7 +303,7 @@ class VLLMManager:
          python_cmd = str(Path.home() / "vllm_env/bin/python3") if (Path.home() / "vllm_env/bin/python3").exists() else "python3"

          # Base command - ensure vllm_args is properly quoted
-         cmd = f'{python_cmd} -m vllm.entrypoints.openai.api_server --model "{model_id}" --host 0.0.0.0 --port {port} {vllm_args}'
+         cmd = f'{python_cmd} -u -m vllm.entrypoints.openai.api_server --model "{model_id}" --host 0.0.0.0 --port {port} {vllm_args}'

          # Use environment as-is (already configured by .pirc)
          env = os.environ.copy()
@@ -351,6 +351,37 @@ class VLLMManager:
          except:
              pass

+         # Force kill all vLLM-related Python processes to ensure cleanup
+         max_attempts = 5
+         for attempt in range(max_attempts):
+             try:
+                 # Get all python processes containing 'vllm'
+                 ps_result = sp.run(['ps', 'aux'], capture_output=True, text=True)
+                 vllm_pids = []
+
+                 for line in ps_result.stdout.split('\n'):
+                     if 'python' in line and 'vllm' in line and 'vllm_manager.py' not in line:
+                         # Extract PID (second column)
+                         parts = line.split()
+                         if len(parts) > 1:
+                             vllm_pids.append(parts[1])
+
+                 if not vllm_pids:
+                     break # No vLLM processes found
+
+                 # Kill the vLLM processes
+                 for pid in vllm_pids:
+                     try:
+                         sp.run(['kill', '-9', pid], capture_output=True)
+                     except:
+                         pass
+
+                 # Small delay between attempts
+                 import time
+                 time.sleep(0.5)
+             except:
+                 break
+
          del self.models[name]
          self.save()
          return True
@@ -367,6 +398,92 @@ class VLLMManager:
          with open(log_file, 'r') as f:
              all_lines = f.readlines()
          return ''.join(all_lines[-lines:])
+
+     def check_downloads(self):
+         """Check model download progress in HuggingFace cache"""
+         import glob
+         import re
+
+         # Respect HuggingFace environment variables
+         if os.environ.get('HUGGINGFACE_HUB_CACHE'):
+             cache_dir = Path(os.environ['HUGGINGFACE_HUB_CACHE'])
+         elif os.environ.get('HF_HOME'):
+             cache_dir = Path(os.environ['HF_HOME']) / "hub"
+         else:
+             cache_dir = Path.home() / ".cache" / "huggingface" / "hub"
+
+         if not cache_dir.exists():
+             return {"status": "NO_CACHE", "cache_dir": str(cache_dir)}
+
+         model_dirs = list(cache_dir.glob("models--*"))
+         if not model_dirs:
+             return {"status": "NO_MODELS"}
+
+         results = []
+
+         for model_dir in model_dirs:
+             # Extract model name
+             model_name = model_dir.name.replace("models--", "").replace("--", "/")
+
+             # Get size (only count actual blob files, not symlinks)
+             total_size = 0
+             blobs_dir = model_dir / "blobs"
+             if blobs_dir.exists():
+                 for f in blobs_dir.iterdir():
+                     if f.is_file() and not f.name.endswith('.incomplete'):
+                         total_size += f.stat().st_size
+             size_gb = total_size / (1024**3)
+
+             # Count safetensors files in blobs directory (actual files)
+             safetensors_count = 0
+             snapshots_dir = model_dir / "snapshots"
+             if snapshots_dir.exists():
+                 for snapshot in snapshots_dir.iterdir():
+                     if snapshot.is_dir():
+                         safetensors_count = len(list(snapshot.glob("*.safetensors")))
+                         break # Use first snapshot
+             file_count = safetensors_count
+
+             # Get total expected files from filename pattern
+             total_files = 0
+             if snapshots_dir.exists():
+                 for snapshot in snapshots_dir.iterdir():
+                     if snapshot.is_dir():
+                         for f in snapshot.glob("*.safetensors"):
+                             match = re.search(r'model-\d+-of-(\d+)\.safetensors', f.name)
+                             if match:
+                                 total_files = max(total_files, int(match.group(1)))
+                         break # Only check first snapshot
+
+             # Check if actively downloading (check if any incomplete files exist in blobs)
+             incomplete_files = []
+             if blobs_dir.exists():
+                 incomplete_files = list(blobs_dir.glob("*.incomplete"))
+             is_active = len(incomplete_files) > 0
+
+             results.append({
+                 "model": model_name,
+                 "size_gb": round(size_gb, 1),
+                 "files": file_count,
+                 "total_files": total_files,
+                 "active": is_active
+             })
+
+         # Count vLLM processes
+         vllm_count = 0
+         for proc in psutil.process_iter(['pid', 'cmdline']):
+             try:
+                 cmdline = ' '.join(proc.info['cmdline'] or [])
+                 if 'python' in cmdline and 'vllm' in cmdline and 'vllm_manager.py' not in cmdline:
+                     vllm_count += 1
+             except:
+                 pass
+
+         return {
+             "status": "OK",
+             "models": results,
+             "vllm_processes": vllm_count
+         }

  def main():
      import sys
@@ -374,7 +491,7 @@ def main():
      manager = VLLMManager()

      if len(sys.argv) < 2:
-         print("Usage: vllm_manager.py [list|start|stop|logs] ...")
+         print("Usage: vllm_manager.py [list|start|stop|logs|downloads] ...")
          sys.exit(1)

      cmd = sys.argv[1]
@@ -405,8 +522,12 @@ def main():
              elif 'gpu_id' in info and info['gpu_id'] is not None:
                  print(f" GPU: {info['gpu_id']}")
              print(f" URL: http://{host_ip}:{info['port']}/v1")
+             print(f"\n Export for OpenAI clients:")
+             print(f" export OPENAI_BASE_URL='http://{host_ip}:{info['port']}/v1'")
+             print(f" export OPENAI_API_KEY='dummy'")
+             print(f" export OPENAI_MODEL='{info['model_id']}'")
              if 'log_file' in info:
-                 print(f" Logs: {info['log_file']}")
+                 print(f"\n Logs: {info['log_file']}")

      elif cmd == "start":
          if len(sys.argv) < 3:
@@ -435,6 +556,7 @@ def main():
          print(f"URL: http://{host_ip}:{model_result['port']}/v1")
          print(f"\nExport for OpenAI clients:")
          print(f"export OPENAI_BASE_URL='http://{host_ip}:{model_result['port']}/v1'")
+         print(f"export OPENAI_MODEL='{model_id}'")

      elif cmd == "start_raw":
          if len(sys.argv) < 5:
@@ -465,6 +587,7 @@ def main():
          print(f"URL: http://{host_ip}:{model_result['port']}/v1")
          print(f"\nExport for OpenAI clients:")
          print(f"export OPENAI_BASE_URL='http://{host_ip}:{model_result['port']}/v1'")
+         print(f"export OPENAI_MODEL='{model_id}'")

      elif cmd == "stop":
          if len(sys.argv) < 3:
@@ -491,6 +614,46 @@ def main():
          else:
              print(logs, end='')

+     elif cmd == "downloads":
+         # Check if --stream flag is provided
+         stream = len(sys.argv) > 2 and sys.argv[2] == "--stream"
+
+         if stream:
+             # Streaming mode - continuously output status
+             import time
+             import signal
+
+             # Handle SIGTERM/SIGINT for clean shutdown
+             def signal_handler(sig, frame):
+                 sys.exit(0)
+
+             signal.signal(signal.SIGINT, signal_handler)
+             signal.signal(signal.SIGTERM, signal_handler)
+
+             while True:
+                 download_info = manager.check_downloads()
+
+                 if download_info["status"] == "NO_CACHE":
+                     print(json.dumps({"status": "NO_CACHE", "message": "No HuggingFace cache found"}))
+                 elif download_info["status"] == "NO_MODELS":
+                     print(json.dumps({"status": "NO_MODELS", "message": "No models in cache"}))
+                 else:
+                     print(json.dumps(download_info))
+
+                 sys.stdout.flush() # Force flush to ensure output is sent
+                 time.sleep(2) # Update every 2 seconds
+         else:
+             # Single check mode
+             download_info = manager.check_downloads()
+
+             if download_info["status"] == "NO_CACHE":
+                 print("No HuggingFace cache found")
+             elif download_info["status"] == "NO_MODELS":
+                 print("No models in cache")
+             else:
+                 # Output as JSON for easy parsing
+                 print(json.dumps(download_info))
+
      else:
          print(f"Unknown command: {cmd}")
          sys.exit(1)
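
A minimal usage sketch for the new downloads subcommand (not part of the package; the working directory containing vllm_manager.py is assumed):

  python3 vllm_manager.py downloads            # one-shot JSON report of HF cache download progress
  python3 vllm_manager.py downloads --stream   # prints a JSON status line roughly every 2 seconds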