pyllama-server 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ include README.md
2
+ include pyproject.toml
3
+ include LICENSE
4
+ recursive-include pyllama *.py
5
+ recursive-exclude pyllama/bin *
6
+ include pyllama/bin/windows/.gitkeep
7
+ include pyllama/bin/linux/.gitkeep
8
+ include pyllama/bin/darwin/.gitkeep
@@ -0,0 +1,223 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyllama-server
3
+ Version: 0.1.0
4
+ Summary: Python wrapper for llama.cpp server - OpenAI API compatible backend with auto device detection
5
+ Author: llama.cpp community
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ggml-org/llama.cpp
8
+ Project-URL: Documentation, https://github.com/ggml-org/llama.cpp/tree/master/tools/server
9
+ Project-URL: Repository, https://github.com/ggml-org/llama.cpp
10
+ Project-URL: Issues, https://github.com/ggml-org/llama.cpp/issues
11
+ Keywords: llama,llama.cpp,gguf,openai,api,gpu,vulkan,cuda,ai,llm
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Requires-Python: >=3.8
26
+ Description-Content-Type: text/markdown
27
+ Requires-Dist: httpx>=0.24.0
28
+ Requires-Dist: pydantic>=2.0.0
29
+ Requires-Dist: rich>=13.0.0
30
+ Requires-Dist: typer>=0.9.0
31
+ Requires-Dist: psutil>=5.9.0
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
34
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
35
+ Requires-Dist: build>=1.0.0; extra == "dev"
36
+ Requires-Dist: twine>=4.0.0; extra == "dev"
37
+ Provides-Extra: openai
38
+ Requires-Dist: openai>=1.0.0; extra == "openai"
39
+ Provides-Extra: hf
40
+ Requires-Dist: huggingface_hub>=0.20.0; extra == "hf"
41
+ Provides-Extra: modelscope
42
+ Requires-Dist: modelscope>=1.10.0; extra == "modelscope"
43
+ Provides-Extra: all
44
+ Requires-Dist: openai>=1.0.0; extra == "all"
45
+ Requires-Dist: huggingface_hub>=0.20.0; extra == "all"
46
+ Requires-Dist: modelscope>=1.10.0; extra == "all"
47
+
48
+ # pyllama-server
49
+
50
+ Python wrapper for the llama.cpp server: an OpenAI-API-compatible backend with automatic device detection and model downloading.
51
+
52
+ ## Features
53
+
54
+ - **Auto Device Detection**: Automatically detects and configures the best GPU backend (Vulkan, CUDA, ROCm, Metal)
55
+ - **Model Downloading**: Download GGUF models from HuggingFace and ModelScope
56
+ - **OpenAI API Compatible**: Drop-in replacement for the OpenAI API
57
+ - **Function Calling**: Support for tools and function calling
58
+ - **Pre-built Binaries**: Automatically downloads pre-built llama.cpp binaries
59
+
60
+ ## Installation
61
+
62
+ ```bash
63
+ pip install pyllama-server
64
+ ```
65
+
66
+ ## Quick Start
67
+
68
+ ### Command Line
69
+
70
+ ```bash
71
+ # List available GPUs
72
+ pyllama devices
73
+
74
+ # Run inference with a model
75
+ pyllama run ./model.gguf -p "Hello, world!"
76
+
77
+ # Download a model
78
+ pyllama download llama-3.2-3b -q Q4_K_M
79
+
80
+ # Start an OpenAI-compatible server
81
+ pyllama serve llama-3.2-3b -p 8080
82
+ ```
83
+
84
+ ### Python API
85
+
86
+ ```python
87
+ from pyllama import quick_run, quick_server, Client
88
+
89
+ # Quick inference
90
+ result = quick_run("llama-3.2-3b", "Write a haiku about coding")
91
+ print(result)
92
+
93
+ # Start server with auto-configuration
94
+ with quick_server("llama-3.2-3b") as server:
95
+ client = Client(server.base_url)
96
+
97
+ response = client.chat.completions.create(
98
+ model="llama",
99
+ messages=[{"role": "user", "content": "Hello!"}]
100
+ )
101
+ print(response.choices[0]["message"]["content"])
102
+ ```
103
+
104
+ ### Auto Device Detection
105
+
106
+ ```python
107
+ from pyllama import DeviceDetector, AutoRunner
108
+
109
+ # Detect available GPUs
110
+ detector = DeviceDetector()
111
+ devices = detector.detect()
112
+ for device in devices:
113
+ print(f"{device.name} ({device.backend.value}, {device.memory_gb:.1f}GB)")
114
+
115
+ # Get optimal configuration
116
+ config = detector.get_best_device(model_size_gb=5.0)
117
+ print(f"Best device: {config.device.name}")
118
+ print(f"Recommended GPU layers: {config.n_gpu_layers}")
119
+ ```
120
+
121
+ ### Model Download
122
+
123
+ ```python
124
+ from pyllama import ModelDownloader
125
+
126
+ downloader = ModelDownloader()
127
+
128
+ # Download from HuggingFace
129
+ path = downloader.download("Qwen/Qwen2.5-7B-Instruct-GGUF", "Q4_K_M.gguf")
130
+
131
+ # Download from ModelScope
132
+ path = downloader.download(
133
+ "LLM-Research/Meta-Llama-3-8B-Instruct-GGUF",
134
+ "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
135
+ source="modelscope"
136
+ )
137
+ ```
138
+
139
+ ## CLI Commands
140
+
141
+ | Command | Description |
142
+ |---------|-------------|
143
+ | `pyllama serve` | Start OpenAI-compatible server |
144
+ | `pyllama run` | Run inference with a model |
145
+ | `pyllama chat` | Interactive chat with a model |
146
+ | `pyllama download` | Download a model |
147
+ | `pyllama models` | List available models |
148
+ | `pyllama devices` | List GPU devices |
149
+ | `pyllama config` | Show optimal configuration |
150
+ | `pyllama download-binaries` | Download pre-built binaries |
151
+ | `pyllama build` | Build binaries from source |
152
+ | `pyllama clear-cache` | Clear model/binary cache |
153
+
154
+ ## Popular Models
155
+
156
+ | Name | Description | Quantizations |
157
+ |------|-------------|-------|
158
+ | llama-3.2-3b | Llama 3.2 3B | Q4_K_M, Q5_K_M, Q8_0 |
159
+ | llama-3.1-8b | Llama 3.1 8B | Q4_K_M, Q5_K_M, Q8_0 |
160
+ | qwen2.5-7b | Qwen 2.5 7B | Q4_K_M, Q5_K_M, Q8_0 |
161
+ | gemma-2-9b | Gemma 2 9B | Q4_K_M, Q5_K_M, Q8_0 |
162
+ | mistral-7b | Mistral 7B v0.3 | Q4_K_M, Q5_K_M, Q8_0 |
163
+ | phi-3.5-mini | Phi 3.5 Mini | Q4_K_M, Q5_K_M, Q8_0 |
164
+ | deepseek-coder-6.7b | DeepSeek Coder | Q4_K_M, Q5_K_M, Q8_0 |
165
+
166
+ ## GPU Backends
167
+
168
+ | Backend | Platforms | Description |
169
+ |---------|-----------|-------------|
170
+ | Vulkan | Windows, Linux | Cross-platform GPU API, works on AMD, Intel, NVIDIA |
171
+ | CUDA | Windows, Linux | NVIDIA GPUs |
172
+ | ROCm | Linux | AMD GPUs |
173
+ | Metal | macOS | Apple Silicon |
174
+ | CPU | All | Fallback, no GPU required |
175
+
176
+ ## Function Calling
177
+
178
+ ```python
179
+ from pyllama import Client, LlamaServer
180
+
181
+ tools = [{
182
+ "type": "function",
183
+ "function": {
184
+ "name": "get_weather",
185
+ "description": "Get weather for a location",
186
+ "parameters": {
187
+ "type": "object",
188
+ "properties": {
189
+ "location": {"type": "string"}
190
+ }
191
+ }
192
+ }
193
+ }]
194
+
195
+ with LlamaServer("model.gguf") as server:
196
+ client = Client(server.base_url)
197
+
198
+ response = client.chat.completions.create(
199
+ model="llama",
200
+ messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
201
+ tools=tools
202
+ )
203
+
204
+ if response.choices[0]["message"].get("tool_calls"):
205
+ print("Model wants to call:", response.choices[0]["message"]["tool_calls"])
206
+ ```
207
+
208
+ ## Requirements
209
+
210
+ - Python 3.8+
211
+ - Vulkan SDK (for Vulkan backend)
212
+ - CUDA Toolkit (for CUDA backend)
213
+ - ROCm (for ROCm backend)
214
+
215
+ ## License
216
+
217
+ MIT License - same as llama.cpp
218
+
219
+ ## Links
220
+
221
+ - [llama.cpp](https://github.com/ggml-org/llama.cpp)
222
+ - [Documentation](https://github.com/ggml-org/llama.cpp/tree/master/tools/server)
223
+ - [Issues](https://github.com/ggml-org/llama.cpp/issues)
@@ -0,0 +1,176 @@
1
+ # pyllama-server
2
+
3
+ Python wrapper for the llama.cpp server: an OpenAI-API-compatible backend with automatic device detection and model downloading.
4
+
5
+ ## Features
6
+
7
+ - **Auto Device Detection**: Automatically detects and configures the best GPU backend (Vulkan, CUDA, ROCm, Metal)
8
+ - **Model Downloading**: Download GGUF models from HuggingFace and ModelScope
9
+ - **OpenAI API Compatible**: Drop-in replacement for the OpenAI API
10
+ - **Function Calling**: Support for tools and function calling
11
+ - **Pre-built Binaries**: Automatically downloads pre-built llama.cpp binaries
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ pip install pyllama-server
17
+ ```
18
+
19
+ ## Quick Start
20
+
21
+ ### Command Line
22
+
23
+ ```bash
24
+ # List available GPUs
25
+ pyllama devices
26
+
27
+ # Run inference with a model
28
+ pyllama run ./model.gguf -p "Hello, world!"
29
+
30
+ # Download a model
31
+ pyllama download llama-3.2-3b -q Q4_K_M
32
+
33
+ # Start an OpenAI-compatible server
34
+ pyllama serve llama-3.2-3b -p 8080
35
+ ```
36
+
37
+ ### Python API
38
+
39
+ ```python
40
+ from pyllama import quick_run, quick_server, Client
41
+
42
+ # Quick inference
43
+ result = quick_run("llama-3.2-3b", "Write a haiku about coding")
44
+ print(result)
45
+
46
+ # Start server with auto-configuration
47
+ with quick_server("llama-3.2-3b") as server:
48
+ client = Client(server.base_url)
49
+
50
+ response = client.chat.completions.create(
51
+ model="llama",
52
+ messages=[{"role": "user", "content": "Hello!"}]
53
+ )
54
+ print(response.choices[0]["message"]["content"])
55
+ ```
56
+
57
+ ### Auto Device Detection
58
+
59
+ ```python
60
+ from pyllama import DeviceDetector, AutoRunner
61
+
62
+ # Detect available GPUs
63
+ detector = DeviceDetector()
64
+ devices = detector.detect()
65
+ for device in devices:
66
+ print(f"{device.name} ({device.backend.value}, {device.memory_gb:.1f}GB)")
67
+
68
+ # Get optimal configuration
69
+ config = detector.get_best_device(model_size_gb=5.0)
70
+ print(f"Best device: {config.device.name}")
71
+ print(f"Recommended GPU layers: {config.n_gpu_layers}")
72
+ ```
73
+
74
+ ### Model Download
75
+
76
+ ```python
77
+ from pyllama import ModelDownloader
78
+
79
+ downloader = ModelDownloader()
80
+
81
+ # Download from HuggingFace
82
+ path = downloader.download("Qwen/Qwen2.5-7B-Instruct-GGUF", "Q4_K_M.gguf")
83
+
84
+ # Download from ModelScope
85
+ path = downloader.download(
86
+ "LLM-Research/Meta-Llama-3-8B-Instruct-GGUF",
87
+ "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
88
+ source="modelscope"
89
+ )
90
+ ```
91
+
92
+ ## CLI Commands
93
+
94
+ | Command | Description |
95
+ |---------|-------------|
96
+ | `pyllama serve` | Start OpenAI-compatible server |
97
+ | `pyllama run` | Run inference with a model |
98
+ | `pyllama chat` | Interactive chat with a model |
99
+ | `pyllama download` | Download a model |
100
+ | `pyllama models` | List available models |
101
+ | `pyllama devices` | List GPU devices |
102
+ | `pyllama config` | Show optimal configuration |
103
+ | `pyllama download-binaries` | Download pre-built binaries |
104
+ | `pyllama build` | Build binaries from source |
105
+ | `pyllama clear-cache` | Clear model/binary cache |
106
+
107
+ ## Popular Models
108
+
109
+ | Name | Description | Quantizations |
110
+ |------|-------------|-------|
111
+ | llama-3.2-3b | Llama 3.2 3B | Q4_K_M, Q5_K_M, Q8_0 |
112
+ | llama-3.1-8b | Llama 3.1 8B | Q4_K_M, Q5_K_M, Q8_0 |
113
+ | qwen2.5-7b | Qwen 2.5 7B | Q4_K_M, Q5_K_M, Q8_0 |
114
+ | gemma-2-9b | Gemma 2 9B | Q4_K_M, Q5_K_M, Q8_0 |
115
+ | mistral-7b | Mistral 7B v0.3 | Q4_K_M, Q5_K_M, Q8_0 |
116
+ | phi-3.5-mini | Phi 3.5 Mini | Q4_K_M, Q5_K_M, Q8_0 |
117
+ | deepseek-coder-6.7b | DeepSeek Coder | Q4_K_M, Q5_K_M, Q8_0 |
118
+
119
+ ## GPU Backends
120
+
121
+ | Backend | Platforms | Description |
122
+ |---------|-----------|-------------|
123
+ | Vulkan | Windows, Linux | Cross-platform GPU API, works on AMD, Intel, NVIDIA |
124
+ | CUDA | Windows, Linux | NVIDIA GPUs |
125
+ | ROCm | Linux | AMD GPUs |
126
+ | Metal | macOS | Apple Silicon |
127
+ | CPU | All | Fallback, no GPU required |
128
+
129
+ ## Function Calling
130
+
131
+ ```python
132
+ from pyllama import Client, LlamaServer
133
+
134
+ tools = [{
135
+ "type": "function",
136
+ "function": {
137
+ "name": "get_weather",
138
+ "description": "Get weather for a location",
139
+ "parameters": {
140
+ "type": "object",
141
+ "properties": {
142
+ "location": {"type": "string"}
143
+ }
144
+ }
145
+ }
146
+ }]
147
+
148
+ with LlamaServer("model.gguf") as server:
149
+ client = Client(server.base_url)
150
+
151
+ response = client.chat.completions.create(
152
+ model="llama",
153
+ messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
154
+ tools=tools
155
+ )
156
+
157
+ if response.choices[0]["message"].get("tool_calls"):
158
+ print("Model wants to call:", response.choices[0]["message"]["tool_calls"])
159
+ ```
160
+
161
+ ## Requirements
162
+
163
+ - Python 3.8+
164
+ - Vulkan SDK (for Vulkan backend)
165
+ - CUDA Toolkit (for CUDA backend)
166
+ - ROCm (for ROCm backend)
167
+
168
+ ## License
169
+
170
+ MIT License - same as llama.cpp
171
+
172
+ ## Links
173
+
174
+ - [llama.cpp](https://github.com/ggml-org/llama.cpp)
175
+ - [Documentation](https://github.com/ggml-org/llama.cpp/tree/master/tools/server)
176
+ - [Issues](https://github.com/ggml-org/llama.cpp/issues)
@@ -0,0 +1,80 @@
1
+ """
2
+ pyllama - Python wrapper for llama.cpp server
3
+
4
+ Provides an OpenAI API compatible interface for running GGUF models
5
+ with GPU acceleration (Vulkan, CUDA, ROCm, Metal).
6
+
7
+ Features:
8
+ - Auto-detect best GPU device and backend
9
+ - Download models from HuggingFace and ModelScope
10
+ - OpenAI API compatible server
11
+ - Function calling / tool support
12
+ - Multimodal support
13
+
14
+ Quick start:
15
+ from pyllama import quick_run, quick_server
16
+
17
+ # Quick inference
18
+ result = quick_run("llama-3.2-3b", "Write a haiku")
19
+
20
+ # Start server
21
+ with quick_server("llama-3.2-3b") as server:
22
+ # Use the OpenAI-compatible API at http://localhost:8080/v1
23
+ pass
24
+ """
25
+
26
+ __version__ = "0.1.0"
27
+
28
+ from pyllama.client import Client, ChatCompletion, Message
29
+ from pyllama.server import LlamaServer, ServerConfig, GPUInfo
30
+ from pyllama.device import (
31
+ Device, DeviceConfig, DeviceDetector, BackendType,
32
+ detect_best_backend, get_device_config
33
+ )
34
+ from pyllama.models import (
35
+ ModelDownloader, ModelInfo, ModelRegistry, download_model
36
+ )
37
+ from pyllama.runner import (
38
+ AutoRunner, RunConfig, quick_run, quick_server
39
+ )
40
+ from pyllama.binaries import (
41
+ BinaryManager, get_binary_manager, ensure_binaries, get_server_binary
42
+ )
43
+
44
+ __all__ = [
45
+ # Client
46
+ "Client",
47
+ "ChatCompletion",
48
+ "Message",
49
+
50
+ # Server
51
+ "LlamaServer",
52
+ "ServerConfig",
53
+ "GPUInfo",
54
+
55
+ # Device
56
+ "Device",
57
+ "DeviceConfig",
58
+ "DeviceDetector",
59
+ "BackendType",
60
+ "detect_best_backend",
61
+ "get_device_config",
62
+
63
+ # Models
64
+ "ModelDownloader",
65
+ "ModelInfo",
66
+ "ModelRegistry",
67
+ "download_model",
68
+
69
+ # Runner
70
+ "AutoRunner",
71
+ "RunConfig",
72
+ "quick_run",
73
+ "quick_server",
74
+
75
+ # Binaries
76
+ "BinaryManager",
77
+ "get_binary_manager",
78
+ "ensure_binaries",
79
+ "get_server_binary",
80
+ ]