@mariozechner/pi 0.1.5 → 0.2.4
- package/README.md +98 -2
- package/package.json +1 -1
- package/pi.js +576 -75
- package/pod_setup.sh +55 -114
- package/vllm_manager.py +167 -4
package/pod_setup.sh
CHANGED
@@ -1,133 +1,74 @@
-#!/bin/bash
-#
+#!/usr/bin/env bash
+# GPU pod bootstrap: Ubuntu 22.04 + CUDA 12.6/12.8, vLLM latest, FlashInfer w/ TRT kernels (sm70-120)
 
-set -
+set -euo pipefail
 
-
+apt update -y
+apt install -y python3-pip python3-venv git build-essential cmake ninja-build curl
 
-#
-
-
+# --- Install uv (fast Python package manager) --------------------------------
+curl -LsSf https://astral.sh/uv/install.sh | sh
+export PATH="$HOME/.local/bin:$PATH"
 
-# Create
-
-
-
+# --- Create and activate venv ------------------------------------------------
+VENV="$HOME/vllm_env"
+uv venv --python 3.12 --seed "$VENV"
+source "$VENV/bin/activate"
 
-#
-
+# --- Install vLLM with automatic PyTorch selection ---------------------------
+echo "Installing vLLM with automatic CUDA/PyTorch detection..."
+# uv automatically selects the right PyTorch based on CUDA version
+uv pip install vllm --torch-backend=auto
 
-#
-
+# --- Install additional packages ---------------------------------------------
+echo "Installing additional packages..."
+uv pip install huggingface-hub psutil tensorrt hf_transfer
 
-#
-echo "
+# --- FlashInfer installation (optional, improves performance) ----------------
+echo "Attempting FlashInfer installation (optional)..."
+# vLLM will use Flash Attention as fallback if FlashInfer is not available
 
-#
-
-
-
-    echo "Detected CUDA version from nvidia-smi: $CUDA_VERSION"
-elif command -v nvcc &> /dev/null; then
-    CUDA_VERSION=$(nvcc --version | grep "release" | sed -n 's/.*release \([0-9]\+\.[0-9]\+\).*/\1/p')
-    echo "Detected CUDA version from nvcc: $CUDA_VERSION"
+# Try the official FlashInfer package name
+if uv pip install flashinfer-python; then
+    echo "FlashInfer installed successfully"
+    ATTENTION_BACKEND="FLASHINFER"
 else
-
+    echo "FlashInfer not available, using Flash Attention instead"
+    ATTENTION_BACKEND="FLASH_ATTN"
 fi
 
-
-
-    case "$CUDA_VERSION" in
-        12.8*)
-            echo "Installing PyTorch with CUDA 12.8 support"
-            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
-            ;;
-        12.7*)
-            echo "Installing PyTorch with CUDA 12.7 support"
-            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu127
-            ;;
-        12.6*)
-            echo "Installing PyTorch with CUDA 12.6 support"
-            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
-            ;;
-        12.4*)
-            echo "Installing PyTorch with CUDA 12.4 support"
-            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
-            ;;
-        12.1*)
-            echo "Installing PyTorch with CUDA 12.1 support"
-            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
-            ;;
-        11.8*)
-            echo "Installing PyTorch with CUDA 11.8 support"
-            pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
-            ;;
-        *)
-            echo "CUDA $CUDA_VERSION detected - using default PyTorch (may not be optimal)"
-            pip install torch torchvision torchaudio
-            ;;
-    esac
-else
-    echo "WARNING: nvcc not found, installing default PyTorch"
-    pip install torch torchvision torchaudio
-fi
-
-pip install vllm huggingface-hub psutil
-
-# Install FlashInfer for better performance (~15% sampler latency reduction)
-echo "Installing FlashInfer for performance optimization..."
-echo "Building FlashInfer from source..."
-
-# Clone and build FlashInfer from source
-cd /tmp
-if [ -d "flashinfer" ]; then
-    rm -rf flashinfer
-fi
-
-git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-cd flashinfer
-
-# Install from source
-if python -m pip install -v .; then
-    echo "FlashInfer successfully built from source"
-else
-    echo "FlashInfer installation failed (optional)"
-fi
+# --- HF token check ----------------------------------------------------------
+: "${HF_TOKEN:?HF_TOKEN env var required}"
 
-
-
-rm -rf /tmp/flashinfer
+mkdir -p ~/.config/vllm
+touch ~/.config/vllm/do_not_track
 
-
-
-
-
-
-fi
-
-# Create directory for vLLM config
-mkdir -p ~/.config/vllm && touch ~/.config/vllm/do_not_track
-
-# Create .pirc file for consistent environment
-cat > ~/.pirc << EOF
-# Prime Intellect CLI environment
-# This file is sourced by all pi commands
-
-# Activate vLLM virtual environment if it exists
-if [ -d "\$HOME/vllm_env" ]; then
-    source "\$HOME/vllm_env/bin/activate"
-fi
-
-# Performance optimizations
+cat > ~/.pirc <<EOF
+# auto-sourced env
+[ -d "$HOME/vllm_env" ] && source "$HOME/vllm_env/bin/activate"
+export PATH="$HOME/.local/bin:$PATH"
+export VLLM_ATTENTION_BACKEND=${ATTENTION_BACKEND}
 export VLLM_USE_FLASHINFER_SAMPLER=1
 export VLLM_USE_DEEP_GEMM=1
 export VLLM_NO_USAGE_STATS=1
 export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
-
-
-export HF_TOKEN
-export
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+export HF_TOKEN=${HF_TOKEN}
+export HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
+export HF_HUB_ENABLE_HF_TRANSFER=1
 EOF
 
-#
-
+# --- RunPod specific setup ---------------------------------------------------
+if df -h | grep -q "runpod.net.*workspace"; then
+    echo "Detected RunPod instance - setting up workspace symlink..."
+    if [ ! -L ~/.cache/huggingface ]; then
+        mkdir -p /workspace/cache/huggingface
+        rm -rf ~/.cache/huggingface 2>/dev/null || true
+        ln -s /workspace/cache/huggingface ~/.cache/huggingface
+        echo "Created symlink: ~/.cache/huggingface -> /workspace/cache/huggingface"
+    else
+        echo "Symlink already exists"
+    fi
+fi
+
+echo "=== DONE ==="
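Note: the new ~/.pirc heredoc is unquoted, so ${ATTENTION_BACKEND}, ${HF_TOKEN}, $HOME and $PATH are expanded when pod_setup.sh writes the file, not when it is later sourced. A rough sketch of the generated file, assuming the script ran as root, flashinfer-python installed cleanly, and hf_xxx stands in for the real token:

# auto-sourced env
[ -d "/root/vllm_env" ] && source "/root/vllm_env/bin/activate"
export PATH="/root/.local/bin:..."   # $PATH was baked in at setup time
export VLLM_ATTENTION_BACKEND=FLASHINFER
export VLLM_USE_FLASHINFER_SAMPLER=1
export VLLM_USE_DEEP_GEMM=1
export VLLM_NO_USAGE_STATS=1
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export HF_TOKEN=hf_xxx
export HUGGING_FACE_HUB_TOKEN=hf_xxx
export HF_HUB_ENABLE_HF_TRANSFER=1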
package/vllm_manager.py
CHANGED
@@ -197,7 +197,7 @@ class VLLMManager:
         # Start vLLM (use venv python if available)
         python_cmd = str(Path.home() / "vllm_env/bin/python3") if (Path.home() / "vllm_env/bin/python3").exists() else "python3"
         cmd = [
-            python_cmd, "-m", "vllm.entrypoints.openai.api_server",
+            python_cmd, "-u", "-m", "vllm.entrypoints.openai.api_server",
             "--model", model_id,
             "--host", "0.0.0.0",
             "--port", str(port),
@@ -303,7 +303,7 @@ class VLLMManager:
         python_cmd = str(Path.home() / "vllm_env/bin/python3") if (Path.home() / "vllm_env/bin/python3").exists() else "python3"
 
         # Base command - ensure vllm_args is properly quoted
-        cmd = f'{python_cmd} -m vllm.entrypoints.openai.api_server --model "{model_id}" --host 0.0.0.0 --port {port} {vllm_args}'
+        cmd = f'{python_cmd} -u -m vllm.entrypoints.openai.api_server --model "{model_id}" --host 0.0.0.0 --port {port} {vllm_args}'
 
         # Use environment as-is (already configured by .pirc)
         env = os.environ.copy()
@@ -351,6 +351,37 @@ class VLLMManager:
         except:
             pass
 
+        # Force kill all vLLM-related Python processes to ensure cleanup
+        max_attempts = 5
+        for attempt in range(max_attempts):
+            try:
+                # Get all python processes containing 'vllm'
+                ps_result = sp.run(['ps', 'aux'], capture_output=True, text=True)
+                vllm_pids = []
+
+                for line in ps_result.stdout.split('\n'):
+                    if 'python' in line and 'vllm' in line and 'vllm_manager.py' not in line:
+                        # Extract PID (second column)
+                        parts = line.split()
+                        if len(parts) > 1:
+                            vllm_pids.append(parts[1])
+
+                if not vllm_pids:
+                    break  # No vLLM processes found
+
+                # Kill the vLLM processes
+                for pid in vllm_pids:
+                    try:
+                        sp.run(['kill', '-9', pid], capture_output=True)
+                    except:
+                        pass
+
+                # Small delay between attempts
+                import time
+                time.sleep(0.5)
+            except:
+                break
+
         del self.models[name]
         self.save()
         return True
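The cleanup sweep added to stop() selects stray vLLM workers the same way this shell pipeline would (an illustrative sketch, not code from the package): every python process whose command line mentions vllm, excluding the manager itself. It then kill -9s the PIDs, retrying up to five times with a 0.5 s pause so workers that appear mid-shutdown are also caught.

ps aux | grep python | grep vllm | grep -v vllm_manager.py | grep -v grep | awk '{print $2}'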
@@ -367,6 +398,92 @@ class VLLMManager:
         with open(log_file, 'r') as f:
             all_lines = f.readlines()
             return ''.join(all_lines[-lines:])
+
+    def check_downloads(self):
+        """Check model download progress in HuggingFace cache"""
+        import glob
+        import re
+
+        # Respect HuggingFace environment variables
+        if os.environ.get('HUGGINGFACE_HUB_CACHE'):
+            cache_dir = Path(os.environ['HUGGINGFACE_HUB_CACHE'])
+        elif os.environ.get('HF_HOME'):
+            cache_dir = Path(os.environ['HF_HOME']) / "hub"
+        else:
+            cache_dir = Path.home() / ".cache" / "huggingface" / "hub"
+
+        if not cache_dir.exists():
+            return {"status": "NO_CACHE", "cache_dir": str(cache_dir)}
+
+        model_dirs = list(cache_dir.glob("models--*"))
+        if not model_dirs:
+            return {"status": "NO_MODELS"}
+
+        results = []
+
+        for model_dir in model_dirs:
+            # Extract model name
+            model_name = model_dir.name.replace("models--", "").replace("--", "/")
+
+            # Get size (only count actual blob files, not symlinks)
+            total_size = 0
+            blobs_dir = model_dir / "blobs"
+            if blobs_dir.exists():
+                for f in blobs_dir.iterdir():
+                    if f.is_file() and not f.name.endswith('.incomplete'):
+                        total_size += f.stat().st_size
+            size_gb = total_size / (1024**3)
+
+            # Count safetensors files in blobs directory (actual files)
+            safetensors_count = 0
+            snapshots_dir = model_dir / "snapshots"
+            if snapshots_dir.exists():
+                for snapshot in snapshots_dir.iterdir():
+                    if snapshot.is_dir():
+                        safetensors_count = len(list(snapshot.glob("*.safetensors")))
+                        break  # Use first snapshot
+            file_count = safetensors_count
+
+            # Get total expected files from filename pattern
+            total_files = 0
+            if snapshots_dir.exists():
+                for snapshot in snapshots_dir.iterdir():
+                    if snapshot.is_dir():
+                        for f in snapshot.glob("*.safetensors"):
+                            match = re.search(r'model-\d+-of-(\d+)\.safetensors', f.name)
+                            if match:
+                                total_files = max(total_files, int(match.group(1)))
+                        break  # Only check first snapshot
+
+            # Check if actively downloading (check if any incomplete files exist in blobs)
+            incomplete_files = []
+            if blobs_dir.exists():
+                incomplete_files = list(blobs_dir.glob("*.incomplete"))
+            is_active = len(incomplete_files) > 0
+
+            results.append({
+                "model": model_name,
+                "size_gb": round(size_gb, 1),
+                "files": file_count,
+                "total_files": total_files,
+                "active": is_active
+            })
+
+        # Count vLLM processes
+        vllm_count = 0
+        for proc in psutil.process_iter(['pid', 'cmdline']):
+            try:
+                cmdline = ' '.join(proc.info['cmdline'] or [])
+                if 'python' in cmdline and 'vllm' in cmdline and 'vllm_manager.py' not in cmdline:
+                    vllm_count += 1
+            except:
+                pass
+
+        return {
+            "status": "OK",
+            "models": results,
+            "vllm_processes": vllm_count
+        }
 
 def main():
     import sys
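check_downloads() reads the standard HuggingFace hub cache layout: one models--<org>--<name> directory per model, real payloads under blobs/ (named *.incomplete while still downloading), and symlinked snapshots under snapshots/<revision>/. The same state can be eyeballed from a shell with a sketch like the following, which mirrors the cache-path lookup order used in the code:

CACHE="${HUGGINGFACE_HUB_CACHE:-${HF_HOME:-$HOME/.cache/huggingface}/hub}"
ls -d "$CACHE"/models--*                     # cached models
find "$CACHE" -name '*.incomplete' | wc -l   # non-zero while a download is still active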
@@ -374,7 +491,7 @@ def main():
     manager = VLLMManager()
 
     if len(sys.argv) < 2:
-        print("Usage: vllm_manager.py [list|start|stop|logs] ...")
+        print("Usage: vllm_manager.py [list|start|stop|logs|downloads] ...")
         sys.exit(1)
 
     cmd = sys.argv[1]
@@ -405,8 +522,12 @@ def main():
             elif 'gpu_id' in info and info['gpu_id'] is not None:
                 print(f" GPU: {info['gpu_id']}")
             print(f" URL: http://{host_ip}:{info['port']}/v1")
+            print(f"\n Export for OpenAI clients:")
+            print(f" export OPENAI_BASE_URL='http://{host_ip}:{info['port']}/v1'")
+            print(f" export OPENAI_API_KEY='dummy'")
+            print(f" export OPENAI_MODEL='{info['model_id']}'")
             if 'log_file' in info:
-                print(f" Logs: {info['log_file']}")
+                print(f"\n Logs: {info['log_file']}")
 
     elif cmd == "start":
         if len(sys.argv) < 3:
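The export lines that list now prints (and that start/start_raw print below) are meant to be pasted into a shell, after which any OpenAI-compatible client can reach the pod. A minimal curl sketch against vLLM's chat completions endpoint, assuming the exports have been applied and the model has finished loading:

curl -s "$OPENAI_BASE_URL/chat/completions" \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $OPENAI_API_KEY" \
  -d "{\"model\": \"$OPENAI_MODEL\", \"messages\": [{\"role\": \"user\", \"content\": \"Say hi\"}]}"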
@@ -435,6 +556,7 @@ def main():
         print(f"URL: http://{host_ip}:{model_result['port']}/v1")
         print(f"\nExport for OpenAI clients:")
         print(f"export OPENAI_BASE_URL='http://{host_ip}:{model_result['port']}/v1'")
+        print(f"export OPENAI_MODEL='{model_id}'")
 
     elif cmd == "start_raw":
         if len(sys.argv) < 5:
@@ -465,6 +587,7 @@ def main():
         print(f"URL: http://{host_ip}:{model_result['port']}/v1")
         print(f"\nExport for OpenAI clients:")
         print(f"export OPENAI_BASE_URL='http://{host_ip}:{model_result['port']}/v1'")
+        print(f"export OPENAI_MODEL='{model_id}'")
 
     elif cmd == "stop":
         if len(sys.argv) < 3:
@@ -491,6 +614,46 @@ def main():
         else:
             print(logs, end='')
 
+    elif cmd == "downloads":
+        # Check if --stream flag is provided
+        stream = len(sys.argv) > 2 and sys.argv[2] == "--stream"
+
+        if stream:
+            # Streaming mode - continuously output status
+            import time
+            import signal
+
+            # Handle SIGTERM/SIGINT for clean shutdown
+            def signal_handler(sig, frame):
+                sys.exit(0)
+
+            signal.signal(signal.SIGINT, signal_handler)
+            signal.signal(signal.SIGTERM, signal_handler)
+
+            while True:
+                download_info = manager.check_downloads()
+
+                if download_info["status"] == "NO_CACHE":
+                    print(json.dumps({"status": "NO_CACHE", "message": "No HuggingFace cache found"}))
+                elif download_info["status"] == "NO_MODELS":
+                    print(json.dumps({"status": "NO_MODELS", "message": "No models in cache"}))
+                else:
+                    print(json.dumps(download_info))
+
+                sys.stdout.flush()  # Force flush to ensure output is sent
+                time.sleep(2)  # Update every 2 seconds
+        else:
+            # Single check mode
+            download_info = manager.check_downloads()
+
+            if download_info["status"] == "NO_CACHE":
+                print("No HuggingFace cache found")
+            elif download_info["status"] == "NO_MODELS":
+                print("No models in cache")
+            else:
+                # Output as JSON for easy parsing
+                print(json.dumps(download_info))
+
     else:
         print(f"Unknown command: {cmd}")
         sys.exit(1)
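The new downloads subcommand emits a single JSON snapshot of the cache (per-model fields model, size_gb, files, total_files, active, plus a vllm_processes count), while downloads --stream re-emits that object every ~2 seconds for the pi CLI to poll. Illustrative invocations, assuming a shell on the pod with vllm_manager.py in the working directory and at least one model cached:

python3 vllm_manager.py downloads              # one-shot JSON snapshot
python3 vllm_manager.py downloads --stream     # one JSON object every ~2 s; stop with Ctrl-C
python3 vllm_manager.py downloads | jq '.models[] | {model, size_gb, files, total_files, active}'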