PyPI - pyllama-server - Versions diffs - 0.1.0__tar.gz - Mend

pyllama-server 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

pyllama_server-0.1.0/MANIFEST.in +8 -0
pyllama_server-0.1.0/PKG-INFO +223 -0
pyllama_server-0.1.0/README.md +176 -0
pyllama_server-0.1.0/pyllama/__init__.py +80 -0
pyllama_server-0.1.0/pyllama/benchmark.py +409 -0
pyllama_server-0.1.0/pyllama/bin/darwin/.gitkeep +0 -0
pyllama_server-0.1.0/pyllama/bin/linux/.gitkeep +0 -0
pyllama_server-0.1.0/pyllama/bin/windows/.gitkeep +0 -0
pyllama_server-0.1.0/pyllama/binaries.py +335 -0
pyllama_server-0.1.0/pyllama/cli.py +698 -0
pyllama_server-0.1.0/pyllama/client.py +174 -0
pyllama_server-0.1.0/pyllama/device.py +327 -0
pyllama_server-0.1.0/pyllama/models.py +549 -0
pyllama_server-0.1.0/pyllama/runner.py +316 -0
pyllama_server-0.1.0/pyllama/server.py +266 -0
pyllama_server-0.1.0/pyllama_server.egg-info/PKG-INFO +223 -0
pyllama_server-0.1.0/pyllama_server.egg-info/SOURCES.txt +21 -0
pyllama_server-0.1.0/pyllama_server.egg-info/dependency_links.txt +1 -0
pyllama_server-0.1.0/pyllama_server.egg-info/entry_points.txt +2 -0
pyllama_server-0.1.0/pyllama_server.egg-info/requires.txt +25 -0
pyllama_server-0.1.0/pyllama_server.egg-info/top_level.txt +1 -0
pyllama_server-0.1.0/pyproject.toml +83 -0
pyllama_server-0.1.0/setup.cfg +4 -0

pyllama_server-0.1.0/MANIFEST.in ADDED Viewed

@@ -0,0 +1,8 @@
+include README.md
+include pyproject.toml
+include LICENSE
+recursive-include pyllama *.py
+recursive-exclude pyllama/bin *
+include pyllama/bin/windows/.gitkeep
+include pyllama/bin/linux/.gitkeep
+include pyllama/bin/darwin/.gitkeep

pyllama_server-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,223 @@
+Metadata-Version: 2.4
+Name: pyllama-server
+Version: 0.1.0
+Summary: Python wrapper for llama.cpp server - OpenAI API compatible backend with auto device detection
+Author: llama.cpp community
+License: MIT
+Project-URL: Homepage, https://github.com/ggml-org/llama.cpp
+Project-URL: Documentation, https://github.com/ggml-org/llama.cpp/tree/master/tools/server
+Project-URL: Repository, https://github.com/ggml-org/llama.cpp
+Project-URL: Issues, https://github.com/ggml-org/llama.cpp/issues
+Keywords: llama,llama.cpp,gguf,openai,api,gpu,vulkan,cuda,ai,llm
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+Requires-Dist: httpx>=0.24.0
+Requires-Dist: pydantic>=2.0.0
+Requires-Dist: rich>=13.0.0
+Requires-Dist: typer>=0.9.0
+Requires-Dist: psutil>=5.9.0
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0.0; extra == "dev"
+Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
+Requires-Dist: build>=1.0.0; extra == "dev"
+Requires-Dist: twine>=4.0.0; extra == "dev"
+Provides-Extra: openai
+Requires-Dist: openai>=1.0.0; extra == "openai"
+Provides-Extra: hf
+Requires-Dist: huggingface_hub>=0.20.0; extra == "hf"
+Provides-Extra: modelscope
+Requires-Dist: modelscope>=1.10.0; extra == "modelscope"
+Provides-Extra: all
+Requires-Dist: openai>=1.0.0; extra == "all"
+Requires-Dist: huggingface_hub>=0.20.0; extra == "all"
+Requires-Dist: modelscope>=1.10.0; extra == "all"
+# pyllama-server
+Python wrapper for llama.cpp server - OpenAI API compatible backend with automatic device detection and model downloading.
+## Features
+- **Auto Device Detection**: Automatically detects and configures the best GPU backend (Vulkan, CUDA, ROCm, Metal)
+- **Model Downloading**: Download GGUF models from HuggingFace and ModelScope
+- **OpenAI API Compatible**: Drop-in replacement for the OpenAI API
+- **Function Calling**: Support for tools and function calling
+- **Pre-built Binaries**: Automatically downloads pre-built llama.cpp binaries
+## Installation
+```bash
+pip install pyllama-server
+```
+## Quick Start
+### Command Line
+```bash
+# List available GPUs
+pyllama devices
+# Run inference with a model
+pyllama run ./model.gguf -p "Hello, world!"
+# Download a model
+pyllama download llama-3.2-3b -q Q4_K_M
+# Start an OpenAI-compatible server
+pyllama serve llama-3.2-3b -p 8080
+```
+### Python API
+```python
+from pyllama import quick_run, quick_server, Client
+# Quick inference
+result = quick_run("llama-3.2-3b", "Write a haiku about coding")
+print(result)
+# Start server with auto-configuration
+with quick_server("llama-3.2-3b") as server:
+    client = Client(server.base_url)
+    response = client.chat.completions.create(
+        model="llama",
+        messages=[{"role": "user", "content": "Hello!"}]
+    )
+    print(response.choices[0]["message"]["content"])
+```
+### Auto Device Detection
+```python
+from pyllama import DeviceDetector, AutoRunner
+# Detect available GPUs
+detector = DeviceDetector()
+devices = detector.detect()
+for device in devices:
+    print(f"{device.name} ({device.backend.value}, {device.memory_gb:.1f}GB)")
+# Get optimal configuration
+config = detector.get_best_device(model_size_gb=5.0)
+print(f"Best device: {config.device.name}")
+print(f"Recommended GPU layers: {config.n_gpu_layers}")
+```
+### Model Download
+```python
+from pyllama import ModelDownloader
+downloader = ModelDownloader()
+# Download from HuggingFace
+path = downloader.download("Qwen/Qwen2.5-7B-Instruct-GGUF", "Q4_K_M.gguf")
+# Download from ModelScope
+path = downloader.download(
+    "LLM-Research/Meta-Llama-3-8B-Instruct-GGUF",
+    "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
+    source="modelscope"
+)
+```
+## CLI Commands
+| Command | Description |
+|---------|-------------|
+| `pyllama serve` | Start OpenAI-compatible server |
+| `pyllama run` | Run inference with a model |
+| `pyllama chat` | Interactive chat with a model |
+| `pyllama download` | Download a model |
+| `pyllama models` | List available models |
+| `pyllama devices` | List GPU devices |
+| `pyllama config` | Show optimal configuration |
+| `pyllama download-binaries` | Download pre-built binaries |
+| `pyllama build` | Build binaries from source |
+| `pyllama clear-cache` | Clear model/binary cache |
+## Popular Models
+| Name | Description | Quantizations |
+|------|-------------|-------|
+| llama-3.2-3b | Llama 3.2 3B | Q4_K_M, Q5_K_M, Q8_0 |
+| llama-3.1-8b | Llama 3.1 8B | Q4_K_M, Q5_K_M, Q8_0 |
+| qwen2.5-7b | Qwen 2.5 7B | Q4_K_M, Q5_K_M, Q8_0 |
+| gemma-2-9b | Gemma 2 9B | Q4_K_M, Q5_K_M, Q8_0 |
+| mistral-7b | Mistral 7B v0.3 | Q4_K_M, Q5_K_M, Q8_0 |
+| phi-3.5-mini | Phi 3.5 Mini | Q4_K_M, Q5_K_M, Q8_0 |
+| deepseek-coder-6.7b | DeepSeek Coder | Q4_K_M, Q5_K_M, Q8_0 |
+## GPU Backends
+| Backend | Platforms | Description |
+|---------|-----------|-------------|
+| Vulkan | Windows, Linux | Cross-platform GPU API, works on AMD, Intel, NVIDIA |
+| CUDA | Windows, Linux | NVIDIA GPUs |
+| ROCm | Linux | AMD GPUs |
+| Metal | macOS | Apple Silicon |
+| CPU | All | Fallback, no GPU required |
+## Function Calling
+```python
+from pyllama import Client, LlamaServer
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get weather for a location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {"type": "string"}
+            }
+        }
+    }
+}]
+with LlamaServer("model.gguf") as server:
+    client = Client(server.base_url)
+    response = client.chat.completions.create(
+        model="llama",
+        messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
+        tools=tools
+    )
+    if response.choices[0]["message"].get("tool_calls"):
+        print("Model wants to call:", response.choices[0]["message"]["tool_calls"])
+```
+## Requirements
+- Python 3.8+
+- Vulkan SDK (for Vulkan backend)
+- CUDA Toolkit (for CUDA backend)
+- ROCm (for ROCm backend)
+## License
+MIT License - same as llama.cpp
+## Links
+- [llama.cpp](https://github.com/ggml-org/llama.cpp)
+- [Documentation](https://github.com/ggml-org/llama.cpp/tree/master/tools/server)
+- [Issues](https://github.com/ggml-org/llama.cpp/issues)

pyllama_server-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,176 @@
+# pyllama-server
+Python wrapper for llama.cpp server - OpenAI API compatible backend with automatic device detection and model downloading.
+## Features
+- **Auto Device Detection**: Automatically detects and configures the best GPU backend (Vulkan, CUDA, ROCm, Metal)
+- **Model Downloading**: Download GGUF models from HuggingFace and ModelScope
+- **OpenAI API Compatible**: Drop-in replacement for the OpenAI API
+- **Function Calling**: Support for tools and function calling
+- **Pre-built Binaries**: Automatically downloads pre-built llama.cpp binaries
+## Installation
+```bash
+pip install pyllama-server
+```
+## Quick Start
+### Command Line
+```bash
+# List available GPUs
+pyllama devices
+# Run inference with a model
+pyllama run ./model.gguf -p "Hello, world!"
+# Download a model
+pyllama download llama-3.2-3b -q Q4_K_M
+# Start an OpenAI-compatible server
+pyllama serve llama-3.2-3b -p 8080
+```
+### Python API
+```python
+from pyllama import quick_run, quick_server, Client
+# Quick inference
+result = quick_run("llama-3.2-3b", "Write a haiku about coding")
+print(result)
+# Start server with auto-configuration
+with quick_server("llama-3.2-3b") as server:
+    client = Client(server.base_url)
+    response = client.chat.completions.create(
+        model="llama",
+        messages=[{"role": "user", "content": "Hello!"}]
+    )
+    print(response.choices[0]["message"]["content"])
+```
+### Auto Device Detection
+```python
+from pyllama import DeviceDetector, AutoRunner
+# Detect available GPUs
+detector = DeviceDetector()
+devices = detector.detect()
+for device in devices:
+    print(f"{device.name} ({device.backend.value}, {device.memory_gb:.1f}GB)")
+# Get optimal configuration
+config = detector.get_best_device(model_size_gb=5.0)
+print(f"Best device: {config.device.name}")
+print(f"Recommended GPU layers: {config.n_gpu_layers}")
+```
+### Model Download
+```python
+from pyllama import ModelDownloader
+downloader = ModelDownloader()
+# Download from HuggingFace
+path = downloader.download("Qwen/Qwen2.5-7B-Instruct-GGUF", "Q4_K_M.gguf")
+# Download from ModelScope
+path = downloader.download(
+    "LLM-Research/Meta-Llama-3-8B-Instruct-GGUF",
+    "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
+    source="modelscope"
+)
+```
+## CLI Commands
+| Command | Description |
+|---------|-------------|
+| `pyllama serve` | Start OpenAI-compatible server |
+| `pyllama run` | Run inference with a model |
+| `pyllama chat` | Interactive chat with a model |
+| `pyllama download` | Download a model |
+| `pyllama models` | List available models |
+| `pyllama devices` | List GPU devices |
+| `pyllama config` | Show optimal configuration |
+| `pyllama download-binaries` | Download pre-built binaries |
+| `pyllama build` | Build binaries from source |
+| `pyllama clear-cache` | Clear model/binary cache |
+## Popular Models
+| Name | Description | Quantizations |
+|------|-------------|-------|
+| llama-3.2-3b | Llama 3.2 3B | Q4_K_M, Q5_K_M, Q8_0 |
+| llama-3.1-8b | Llama 3.1 8B | Q4_K_M, Q5_K_M, Q8_0 |
+| qwen2.5-7b | Qwen 2.5 7B | Q4_K_M, Q5_K_M, Q8_0 |
+| gemma-2-9b | Gemma 2 9B | Q4_K_M, Q5_K_M, Q8_0 |
+| mistral-7b | Mistral 7B v0.3 | Q4_K_M, Q5_K_M, Q8_0 |
+| phi-3.5-mini | Phi 3.5 Mini | Q4_K_M, Q5_K_M, Q8_0 |
+| deepseek-coder-6.7b | DeepSeek Coder | Q4_K_M, Q5_K_M, Q8_0 |
+## GPU Backends
+| Backend | Platforms | Description |
+|---------|-----------|-------------|
+| Vulkan | Windows, Linux | Cross-platform GPU API, works on AMD, Intel, NVIDIA |
+| CUDA | Windows, Linux | NVIDIA GPUs |
+| ROCm | Linux | AMD GPUs |
+| Metal | macOS | Apple Silicon |
+| CPU | All | Fallback, no GPU required |
+## Function Calling
+```python
+from pyllama import Client, LlamaServer
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get weather for a location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {"type": "string"}
+            }
+        }
+    }
+}]
+with LlamaServer("model.gguf") as server:
+    client = Client(server.base_url)
+    response = client.chat.completions.create(
+        model="llama",
+        messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
+        tools=tools
+    )
+    if response.choices[0]["message"].get("tool_calls"):
+        print("Model wants to call:", response.choices[0]["message"]["tool_calls"])
+```
+## Requirements
+- Python 3.8+
+- Vulkan SDK (for Vulkan backend)
+- CUDA Toolkit (for CUDA backend)
+- ROCm (for ROCm backend)
+## License
+MIT License - same as llama.cpp
+## Links
+- [llama.cpp](https://github.com/ggml-org/llama.cpp)
+- [Documentation](https://github.com/ggml-org/llama.cpp/tree/master/tools/server)
+- [Issues](https://github.com/ggml-org/llama.cpp/issues)

pyllama_server-0.1.0/pyllama/__init__.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""
+pyllama - Python wrapper for llama.cpp server
+Provides an OpenAI API compatible interface for running GGUF models
+with GPU acceleration (Vulkan, CUDA, ROCm, Metal).
+Features:
+- Auto-detect best GPU device and backend
+- Download models from HuggingFace and ModelScope
+- OpenAI API compatible server
+- Function calling / tool support
+- Multimodal support
+Quick start:
+    from pyllama import quick_run, quick_server
+    # Quick inference
+    result = quick_run("llama-3.2-3b", "Write a haiku")
+    # Start server
+    with quick_server("llama-3.2-3b") as server:
+        # Use OpenAI API at http://localhost:8080/v1
+        pass
+"""
+__version__ = "0.1.0"
+from pyllama.client import Client, ChatCompletion, Message
+from pyllama.server import LlamaServer, ServerConfig, GPUInfo
+from pyllama.device import (
+    Device, DeviceConfig, DeviceDetector, BackendType,
+    detect_best_backend, get_device_config
+)
+from pyllama.models import (
+    ModelDownloader, ModelInfo, ModelRegistry, download_model
+)
+from pyllama.runner import (
+    AutoRunner, RunConfig, quick_run, quick_server
+)
+from pyllama.binaries import (
+    BinaryManager, get_binary_manager, ensure_binaries, get_server_binary
+)
+__all__ = [
+    # Client
+    "Client",
+    "ChatCompletion",
+    "Message",
+    # Server
+    "LlamaServer",
+    "ServerConfig",
+    "GPUInfo",
+    # Device
+    "Device",
+    "DeviceConfig",
+    "DeviceDetector",
+    "BackendType",
+    "detect_best_backend",
+    "get_device_config",
+    # Models
+    "ModelDownloader",
+    "ModelInfo",
+    "ModelRegistry",
+    "download_model",
+    # Runner
+    "AutoRunner",
+    "RunConfig",
+    "quick_run",
+    "quick_server",
+    # Binaries
+    "BinaryManager",
+    "get_binary_manager",
+    "ensure_binaries",
+    "get_server_binary",
+]