llama-cpp-py 0.1.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp_py-0.1.11/LICENSE +21 -0
- llama_cpp_py-0.1.11/PKG-INFO +229 -0
- llama_cpp_py-0.1.11/README.md +205 -0
- llama_cpp_py-0.1.11/llama_cpp_py/__init__.py +5 -0
- llama_cpp_py-0.1.11/llama_cpp_py/client/__init__.py +0 -0
- llama_cpp_py-0.1.11/llama_cpp_py/logger.py +4 -0
- llama_cpp_py-0.1.11/llama_cpp_py/release_manager/__init__.py +0 -0
- llama_cpp_py-0.1.11/llama_cpp_py/release_manager/base.py +209 -0
- llama_cpp_py-0.1.11/llama_cpp_py/release_manager/manager.py +62 -0
- llama_cpp_py-0.1.11/llama_cpp_py/server/__init__.py +4 -0
- llama_cpp_py-0.1.11/llama_cpp_py/server/async_.py +85 -0
- llama_cpp_py-0.1.11/llama_cpp_py/server/base.py +45 -0
- llama_cpp_py-0.1.11/llama_cpp_py/server/sync.py +91 -0
- llama_cpp_py-0.1.11/llama_cpp_py.egg-info/PKG-INFO +229 -0
- llama_cpp_py-0.1.11/llama_cpp_py.egg-info/SOURCES.txt +21 -0
- llama_cpp_py-0.1.11/llama_cpp_py.egg-info/dependency_links.txt +1 -0
- llama_cpp_py-0.1.11/llama_cpp_py.egg-info/requires.txt +11 -0
- llama_cpp_py-0.1.11/llama_cpp_py.egg-info/top_level.txt +1 -0
- llama_cpp_py-0.1.11/pyproject.toml +37 -0
- llama_cpp_py-0.1.11/setup.cfg +4 -0
- llama_cpp_py-0.1.11/tests/test_async_completions.py +56 -0
- llama_cpp_py-0.1.11/tests/test_manager.py +0 -0
- llama_cpp_py-0.1.11/tests/test_sync_completions.py +54 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 sergey21000
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llama-cpp-py
|
|
3
|
+
Version: 0.1.11
|
|
4
|
+
Summary: Python wrapper for running the llama.cpp server
|
|
5
|
+
Author: sergey21000
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/sergey21000/llama_cpp_py
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: aiohttp==3.13.1
|
|
14
|
+
Requires-Dist: requests==2.32.5
|
|
15
|
+
Requires-Dist: tqdm==4.67.1
|
|
16
|
+
Requires-Dist: openai==2.5.0
|
|
17
|
+
Requires-Dist: python-dotenv==1.1.1
|
|
18
|
+
Requires-Dist: platformdirs==4.5.0
|
|
19
|
+
Provides-Extra: test
|
|
20
|
+
Requires-Dist: pytest==8.4.2; extra == "test"
|
|
21
|
+
Requires-Dist: pytest-asyncio>=1.2.0; extra == "test"
|
|
22
|
+
Requires-Dist: colorama==0.4.6; extra == "test"
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# llama-cpp-py
|
|
28
|
+
|
|
29
|
+
Python wrapper for running the [llama.cpp](https://github.com/ggml-org/llama.cpp) server with automatic or manual binary management.
|
|
30
|
+
Runs the server in a separate subprocess supporting both synchronous and asynchronous APIs.
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
## Requirements
|
|
34
|
+
|
|
35
|
+
Python 3.10 or higher.
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
## Installation
|
|
39
|
+
|
|
40
|
+
Install from PyPI
|
|
41
|
+
```sh
|
|
42
|
+
pip install llama-cpp-py
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Install from source
|
|
46
|
+
```sh
|
|
47
|
+
git clone https://github.com/sergey21000/llama-cpp-py
|
|
48
|
+
cd llama-cpp-py
|
|
49
|
+
pip install -e .
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
## Quick Start
|
|
54
|
+
|
|
55
|
+
More examples in the Google Colab notebook <a href="https://colab.research.google.com/drive/17f6tD5TM9EP52-3NZtZ1qQ-QrrLUTBEG"><img src="https://img.shields.io/static/v1?message=Open%20in%20Colab&logo=googlecolab&labelColor=5c5c5c&color=0f80c1&label=%20" alt="Open in Colab"></a>
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
### 1. Set up environment file for llama.cpp
|
|
59
|
+
|
|
60
|
+
Creating an `env.llama` file with variables for llama.cpp server
|
|
61
|
+
```sh
|
|
62
|
+
# download example env file
|
|
63
|
+
wget https://github.com/sergey21000/llama-cpp-py/raw/main/env.llama
|
|
64
|
+
# or create manually
|
|
65
|
+
nano env.llama
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Example `env.llama` content:
|
|
69
|
+
```env
|
|
70
|
+
# llama.cpp server environment variables
|
|
71
|
+
# See: https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#usage
|
|
72
|
+
|
|
73
|
+
# Model source
|
|
74
|
+
LLAMA_ARG_HF_REPO=bartowski/Qwen_Qwen3-4B-GGUF
|
|
75
|
+
LLAMA_ARG_HF_FILE=Qwen_Qwen3-4B-Q4_K_M.gguf
|
|
76
|
+
|
|
77
|
+
# Alternative model options
|
|
78
|
+
# LLAMA_ARG_MODEL_URL=https://huggingface.co/bartowski/Qwen_Qwen3-4B-GGUF/resolve/main/Qwen_Qwen3-4B-Q4_K_M.gguf
|
|
79
|
+
# LLAMA_ARG_MODEL=gguf_models/Qwen_Qwen3-4B-Q4_K_M.gguf
|
|
80
|
+
# LLAMA_ARG_MODEL=D:/models/Qwen_Qwen3-4B-Q4_K_M.gguf
|
|
81
|
+
|
|
82
|
+
# Server configuration
|
|
83
|
+
LLAMA_ARG_JINJA=1
|
|
84
|
+
LLAMA_ARG_CTX_SIZE=4096
|
|
85
|
+
LLAMA_ARG_NO_WEBUI=1
|
|
86
|
+
LLAMA_ARG_N_PARALLEL=1
|
|
87
|
+
LLAMA_ARG_N_GPU_LAYERS=-1
|
|
88
|
+
|
|
89
|
+
# Network endpoint
|
|
90
|
+
LLAMA_ARG_PORT=8080
|
|
91
|
+
LLAMA_ARG_HOST=127.0.0.1
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### 2. Launch the server and send requests
|
|
95
|
+
|
|
96
|
+
Launching a synchronous server based on the latest [llama.cpp release](https://github.com/ggml-org/llama.cpp/releases) version
|
|
97
|
+
```python
|
|
98
|
+
import os
|
|
99
|
+
from dotenv import dotenv_values
|
|
100
|
+
from openai import OpenAI
|
|
101
|
+
from llama_cpp_py import LlamaSyncServer
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# environment variables for llama.cpp
|
|
105
|
+
llama_env = dotenv_values('env.llama')
|
|
106
|
+
llama_env.update(os.environ)
|
|
107
|
+
|
|
108
|
+
# auto-download last release and start server
|
|
109
|
+
# set verbose=True to display server logs
|
|
110
|
+
server = LlamaSyncServer()
|
|
111
|
+
server.start(verbose=False, env=llama_env)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# sending requests with OpenAI client
|
|
115
|
+
client = OpenAI(
|
|
116
|
+
base_url=server.server_url + '/v1',
|
|
117
|
+
api_key='sk-no-key-required',
|
|
118
|
+
)
|
|
119
|
+
response = client.chat.completions.create(
|
|
120
|
+
model='local',
|
|
121
|
+
messages=[{'role': 'user', 'content': 'Hello!'}]
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
# stopping the server
|
|
125
|
+
server.stop()
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Launching an asynchronous server based on a specific release version
|
|
129
|
+
```python
|
|
130
|
+
import os
|
|
131
|
+
import asyncio
|
|
132
|
+
from openai import AsyncOpenAI
|
|
133
|
+
from dotenv import dotenv_values
|
|
134
|
+
from llama_cpp_py import LlamaAsyncServer, LlamaReleaseManager
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# environment variables for llama.cpp
|
|
138
|
+
llama_env = dotenv_values('env.llama')
|
|
139
|
+
llama_env.update(os.environ)
|
|
140
|
+
|
|
141
|
+
# a) download a release by a specific tag with the 'cuda' priority in the title
|
|
142
|
+
# set tag='latest' to use the latest llama.cpp release version
|
|
143
|
+
# optionally specify priority_patterns to prefer certain builds (e.g. 'cuda' or 'cpu')
|
|
144
|
+
release_manager = LlamaReleaseManager(tag='b6780', priority_patterns=['cuda'])
|
|
145
|
+
|
|
146
|
+
# b) or set a specific release url in zip format
|
|
147
|
+
# release_manager = LlamaReleaseManager(
|
|
148
|
+
# release_zip_url='https://github.com/ggml-org/llama.cpp/releases/download/b6780/llama-b6780-bin-win-cuda-12.4-x64.zip'
|
|
149
|
+
# )
|
|
150
|
+
|
|
151
|
+
# c) or selecting the compiled directory llama.cpp
|
|
152
|
+
# release_manager = LlamaReleaseManager(release_dir='/content/llama.cpp/build/bin')
|
|
153
|
+
|
|
154
|
+
async def main():
|
|
155
|
+
# start llama.cpp server (set verbose=True to display server logs)
|
|
156
|
+
llama_server = LlamaAsyncServer(verbose=False, release_manager=release_manager)
|
|
157
|
+
await llama_server.start(env=llama_env)
|
|
158
|
+
|
|
159
|
+
# sending requests with OpenAI client
|
|
160
|
+
client = AsyncOpenAI(
|
|
161
|
+
base_url=f'{llama_server.server_url}/v1',
|
|
162
|
+
api_key='sk-no-key-required',
|
|
163
|
+
)
|
|
164
|
+
stream_response = await client.chat.completions.create(
|
|
165
|
+
model='local',
|
|
166
|
+
messages=[{'role': 'user', 'content': 'How are you?'}],
|
|
167
|
+
stream=True,
|
|
168
|
+
temperature=0.8,
|
|
169
|
+
max_tokens=-1,
|
|
170
|
+
extra_body=dict(
|
|
171
|
+
top_k=40,
|
|
172
|
+
reasoning_format='none',
|
|
173
|
+
chat_template_kwargs=dict(
|
|
174
|
+
enable_thinking=True,
|
|
175
|
+
),
|
|
176
|
+
),
|
|
177
|
+
)
|
|
178
|
+
full_response = ''
|
|
179
|
+
async for chunk in stream_response:
|
|
180
|
+
if (token := chunk.choices[0].delta.content) is not None:
|
|
181
|
+
full_response += token
|
|
182
|
+
print(token, end='', flush=True)
|
|
183
|
+
|
|
184
|
+
# stopping the server
|
|
185
|
+
await llama_server.stop()
|
|
186
|
+
|
|
187
|
+
if __name__ == '__main__':
|
|
188
|
+
asyncio.run(main())
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
## Troubleshooting
|
|
193
|
+
|
|
194
|
+
If the server fails to start or behaves unexpectedly, check the following:
|
|
195
|
+
- Check that the model path or URL in `env.llama` is correct
|
|
196
|
+
- Verify that the port is not already in use
|
|
197
|
+
- Try setting `verbose=True` to see server logs
|
|
198
|
+
```python
|
|
199
|
+
llama_server = LlamaAsyncServer(verbose=True)
|
|
200
|
+
```
|
|
201
|
+
- Link to the [llama.cpp release](https://github.com/ggml-org/llama.cpp/releases) archive appropriate for your system via
|
|
202
|
+
```python
|
|
203
|
+
LlamaReleaseManager(release_zip_url=url)
|
|
204
|
+
```
|
|
205
|
+
- Or use the path to the directory with the pre-compiled llama.cpp
|
|
206
|
+
```python
|
|
207
|
+
LlamaReleaseManager(release_dir=path_to_binaries)
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
llama.cpp binary releases are downloaded to:
|
|
211
|
+
- **Windows**
|
|
212
|
+
```
|
|
213
|
+
%LOCALAPPDATA%\llama-cpp-py\releases
|
|
214
|
+
```
|
|
215
|
+
- **Linux**
|
|
216
|
+
```
|
|
217
|
+
~/.local/share/llama-cpp-py/releases
|
|
218
|
+
```
|
|
219
|
+
- **MacOS**
|
|
220
|
+
```
|
|
221
|
+
~/Library/Application Support/llama-cpp-py/releases
|
|
222
|
+
```
|
|
223
|
+
See [platformdirs example output](https://github.com/tox-dev/platformdirs?tab=readme-ov-file#example-output)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
## License
|
|
227
|
+
|
|
228
|
+
This project is licensed under the terms of the [MIT](./LICENSE) license.
|
|
229
|
+
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
# llama-cpp-py
|
|
4
|
+
|
|
5
|
+
Python wrapper for running the [llama.cpp](https://github.com/ggml-org/llama.cpp) server with automatic or manual binary management.
|
|
6
|
+
Runs the server in a separate subprocess supporting both synchronous and asynchronous APIs.
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
## Requirements
|
|
10
|
+
|
|
11
|
+
Python 3.10 or higher.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
Install from PyPI
|
|
17
|
+
```sh
|
|
18
|
+
pip install llama-cpp-py
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Install from source
|
|
22
|
+
```sh
|
|
23
|
+
git clone https://github.com/sergey21000/llama-cpp-py
|
|
24
|
+
cd llama-cpp-py
|
|
25
|
+
pip install -e .
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
More examples in the Google Colab notebook <a href="https://colab.research.google.com/drive/17f6tD5TM9EP52-3NZtZ1qQ-QrrLUTBEG"><img src="https://img.shields.io/static/v1?message=Open%20in%20Colab&logo=googlecolab&labelColor=5c5c5c&color=0f80c1&label=%20" alt="Open in Colab"></a>
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
### 1. Set up environment file for llama.cpp
|
|
35
|
+
|
|
36
|
+
Creating an `env.llama` file with variables for llama.cpp server
|
|
37
|
+
```sh
|
|
38
|
+
# download example env file
|
|
39
|
+
wget https://github.com/sergey21000/llama-cpp-py/raw/main/env.llama
|
|
40
|
+
# or create manually
|
|
41
|
+
nano env.llama
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Example `env.llama` content:
|
|
45
|
+
```env
|
|
46
|
+
# llama.cpp server environment variables
|
|
47
|
+
# See: https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md#usage
|
|
48
|
+
|
|
49
|
+
# Model source
|
|
50
|
+
LLAMA_ARG_HF_REPO=bartowski/Qwen_Qwen3-4B-GGUF
|
|
51
|
+
LLAMA_ARG_HF_FILE=Qwen_Qwen3-4B-Q4_K_M.gguf
|
|
52
|
+
|
|
53
|
+
# Alternative model options
|
|
54
|
+
# LLAMA_ARG_MODEL_URL=https://huggingface.co/bartowski/Qwen_Qwen3-4B-GGUF/resolve/main/Qwen_Qwen3-4B-Q4_K_M.gguf
|
|
55
|
+
# LLAMA_ARG_MODEL=gguf_models/Qwen_Qwen3-4B-Q4_K_M.gguf
|
|
56
|
+
# LLAMA_ARG_MODEL=D:/models/Qwen_Qwen3-4B-Q4_K_M.gguf
|
|
57
|
+
|
|
58
|
+
# Server configuration
|
|
59
|
+
LLAMA_ARG_JINJA=1
|
|
60
|
+
LLAMA_ARG_CTX_SIZE=4096
|
|
61
|
+
LLAMA_ARG_NO_WEBUI=1
|
|
62
|
+
LLAMA_ARG_N_PARALLEL=1
|
|
63
|
+
LLAMA_ARG_N_GPU_LAYERS=-1
|
|
64
|
+
|
|
65
|
+
# Network endpoint
|
|
66
|
+
LLAMA_ARG_PORT=8080
|
|
67
|
+
LLAMA_ARG_HOST=127.0.0.1
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### 2. Launch the server and send requests
|
|
71
|
+
|
|
72
|
+
Launching a synchronous server based on the latest [llama.cpp release](https://github.com/ggml-org/llama.cpp/releases) version
|
|
73
|
+
```python
|
|
74
|
+
import os
|
|
75
|
+
from dotenv import dotenv_values
|
|
76
|
+
from openai import OpenAI
|
|
77
|
+
from llama_cpp_py import LlamaSyncServer
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# environment variables for llama.cpp
|
|
81
|
+
llama_env = dotenv_values('env.llama')
|
|
82
|
+
llama_env.update(os.environ)
|
|
83
|
+
|
|
84
|
+
# auto-download last release and start server
|
|
85
|
+
# set verbose=True to display server logs
|
|
86
|
+
server = LlamaSyncServer()
|
|
87
|
+
server.start(verbose=False, env=llama_env)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# sending requests with OpenAI client
|
|
91
|
+
client = OpenAI(
|
|
92
|
+
base_url=server.server_url + '/v1',
|
|
93
|
+
api_key='sk-no-key-required',
|
|
94
|
+
)
|
|
95
|
+
response = client.chat.completions.create(
|
|
96
|
+
model='local',
|
|
97
|
+
messages=[{'role': 'user', 'content': 'Hello!'}]
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
# stopping the server
|
|
101
|
+
server.stop()
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Launching an asynchronous server based on a specific release version
|
|
105
|
+
```python
|
|
106
|
+
import os
|
|
107
|
+
import asyncio
|
|
108
|
+
from openai import AsyncOpenAI
|
|
109
|
+
from dotenv import dotenv_values
|
|
110
|
+
from llama_cpp_py import LlamaAsyncServer, LlamaReleaseManager
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# environment variables for llama.cpp
|
|
114
|
+
llama_env = dotenv_values('env.llama')
|
|
115
|
+
llama_env.update(os.environ)
|
|
116
|
+
|
|
117
|
+
# a) download a release by a specific tag with the 'cuda' priority in the title
|
|
118
|
+
# set tag='latest' to use the latest llama.cpp release version
|
|
119
|
+
# optionally specify priority_patterns to prefer certain builds (e.g. 'cuda' or 'cpu')
|
|
120
|
+
release_manager = LlamaReleaseManager(tag='b6780', priority_patterns=['cuda'])
|
|
121
|
+
|
|
122
|
+
# b) or set a specific release url in zip format
|
|
123
|
+
# release_manager = LlamaReleaseManager(
|
|
124
|
+
# release_zip_url='https://github.com/ggml-org/llama.cpp/releases/download/b6780/llama-b6780-bin-win-cuda-12.4-x64.zip'
|
|
125
|
+
# )
|
|
126
|
+
|
|
127
|
+
# c) or selecting the compiled directory llama.cpp
|
|
128
|
+
# release_manager = LlamaReleaseManager(release_dir='/content/llama.cpp/build/bin')
|
|
129
|
+
|
|
130
|
+
async def main():
|
|
131
|
+
# start llama.cpp server (set verbose=True to display server logs)
|
|
132
|
+
llama_server = LlamaAsyncServer(verbose=False, release_manager=release_manager)
|
|
133
|
+
await llama_server.start(env=llama_env)
|
|
134
|
+
|
|
135
|
+
# sending requests with OpenAI client
|
|
136
|
+
client = AsyncOpenAI(
|
|
137
|
+
base_url=f'{llama_server.server_url}/v1',
|
|
138
|
+
api_key='sk-no-key-required',
|
|
139
|
+
)
|
|
140
|
+
stream_response = await client.chat.completions.create(
|
|
141
|
+
model='local',
|
|
142
|
+
messages=[{'role': 'user', 'content': 'How are you?'}],
|
|
143
|
+
stream=True,
|
|
144
|
+
temperature=0.8,
|
|
145
|
+
max_tokens=-1,
|
|
146
|
+
extra_body=dict(
|
|
147
|
+
top_k=40,
|
|
148
|
+
reasoning_format='none',
|
|
149
|
+
chat_template_kwargs=dict(
|
|
150
|
+
enable_thinking=True,
|
|
151
|
+
),
|
|
152
|
+
),
|
|
153
|
+
)
|
|
154
|
+
full_response = ''
|
|
155
|
+
async for chunk in stream_response:
|
|
156
|
+
if (token := chunk.choices[0].delta.content) is not None:
|
|
157
|
+
full_response += token
|
|
158
|
+
print(token, end='', flush=True)
|
|
159
|
+
|
|
160
|
+
# stopping the server
|
|
161
|
+
await llama_server.stop()
|
|
162
|
+
|
|
163
|
+
if __name__ == '__main__':
|
|
164
|
+
asyncio.run(main())
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
## Troubleshooting
|
|
169
|
+
|
|
170
|
+
If the server fails to start or behaves unexpectedly, check the following:
|
|
171
|
+
- Check that the model path or URL in `env.llama` is correct
|
|
172
|
+
- Verify that the port is not already in use
|
|
173
|
+
- Try setting `verbose=True` to see server logs
|
|
174
|
+
```python
|
|
175
|
+
llama_server = LlamaAsyncServer(verbose=True)
|
|
176
|
+
```
|
|
177
|
+
- Link to the [llama.cpp release](https://github.com/ggml-org/llama.cpp/releases) archive appropriate for your system via
|
|
178
|
+
```python
|
|
179
|
+
LlamaReleaseManager(release_zip_url=url)
|
|
180
|
+
```
|
|
181
|
+
- Or use the path to the directory with the pre-compiled llama.cpp
|
|
182
|
+
```python
|
|
183
|
+
LlamaReleaseManager(release_dir=path_to_binaries)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
llama.cpp binary releases are downloaded to:
|
|
187
|
+
- **Windows**
|
|
188
|
+
```
|
|
189
|
+
%LOCALAPPDATA%\llama-cpp-py\releases
|
|
190
|
+
```
|
|
191
|
+
- **Linux**
|
|
192
|
+
```
|
|
193
|
+
~/.local/share/llama-cpp-py/releases
|
|
194
|
+
```
|
|
195
|
+
- **MacOS**
|
|
196
|
+
```
|
|
197
|
+
~/Library/Application Support/llama-cpp-py/releases
|
|
198
|
+
```
|
|
199
|
+
See [platformdirs example output](https://github.com/tox-dev/platformdirs?tab=readme-ov-file#example-output)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
## License
|
|
203
|
+
|
|
204
|
+
This project is licensed under the terms of the [MIT](./LICENSE) license.
|
|
205
|
+
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import platform
|
|
2
|
+
import zipfile
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
from tqdm import tqdm
|
|
7
|
+
|
|
8
|
+
from llama_cpp_py.logger import logger
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class GithubReleaseManager:
|
|
12
|
+
def __init__(
|
|
13
|
+
self,
|
|
14
|
+
releases_api_url: str,
|
|
15
|
+
releases_dir: str | Path,
|
|
16
|
+
tag: str = 'latest',
|
|
17
|
+
release_zip_url: str = '',
|
|
18
|
+
exclude_patterns: list[str] | None = None,
|
|
19
|
+
priority_patterns: list[str] | None = None,
|
|
20
|
+
):
|
|
21
|
+
self.validate_releases_api_url(releases_api_url)
|
|
22
|
+
self.releases_api_url = releases_api_url
|
|
23
|
+
self.releases_dir = Path(releases_dir)
|
|
24
|
+
if release_zip_url:
|
|
25
|
+
tag = self.get_tag_name_from_url(release_zip_url)
|
|
26
|
+
elif tag == 'latest':
|
|
27
|
+
tag = self.get_tag_name_from_url(self.releases_api_url + '/latest')
|
|
28
|
+
self.tag = tag
|
|
29
|
+
if not release_zip_url:
|
|
30
|
+
release_zip_url = self.get_release_zip_url(
|
|
31
|
+
tag=self.tag,
|
|
32
|
+
exclude_patterns=exclude_patterns,
|
|
33
|
+
priority_patterns=priority_patterns,
|
|
34
|
+
)
|
|
35
|
+
self.validate_release_zip_url(release_zip_url)
|
|
36
|
+
self.release_dir = self.releases_dir / Path(release_zip_url).stem
|
|
37
|
+
if not self.release_dir.exists():
|
|
38
|
+
self.download_and_extract_zip(
|
|
39
|
+
zip_url=release_zip_url,
|
|
40
|
+
extract_dir=self.release_dir,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
@staticmethod
|
|
44
|
+
def validate_releases_api_url(releases_api_url):
|
|
45
|
+
# https://api.github.com/repos/ggml-org/llama.cpp/releases
|
|
46
|
+
if not (
|
|
47
|
+
releases_api_url.startswith('https://api.github.com/repos/')
|
|
48
|
+
and releases_api_url.endswith('/releases')
|
|
49
|
+
):
|
|
50
|
+
raise ValueError(
|
|
51
|
+
'The URL with releases must start with '
|
|
52
|
+
'https://api.github.com/repos/ and end with /releases)'
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
@staticmethod
|
|
56
|
+
def validate_release_zip_url(release_zip_url: str) -> None:
|
|
57
|
+
# https://github.com/ggml-org/llama.cpp/releases/download/b6752/cudart-llama-bin-win-cuda-12.4-x64.zip
|
|
58
|
+
if not (
|
|
59
|
+
release_zip_url.startswith('https://github.com/')
|
|
60
|
+
and release_zip_url.endswith('.zip')
|
|
61
|
+
):
|
|
62
|
+
raise ValueError(
|
|
63
|
+
'The URL with release must start with '
|
|
64
|
+
'https://github.com/ and end with .zip)'
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
@staticmethod
|
|
68
|
+
def get_tag_name_from_url(url: str) -> str:
|
|
69
|
+
if url.endswith('/releases/latest') and 'api.github.com' in url:
|
|
70
|
+
response = requests.get(url)
|
|
71
|
+
response.raise_for_status()
|
|
72
|
+
release_data = response.json()
|
|
73
|
+
if not isinstance(release_data, dict):
|
|
74
|
+
raise ValueError(
|
|
75
|
+
f'Returned a list instead of a dictionary at requests.get("{url}").json().\n'
|
|
76
|
+
'The URL does not lead to the page of one release'
|
|
77
|
+
)
|
|
78
|
+
tag_name = release_data.get('tag_name')
|
|
79
|
+
else:
|
|
80
|
+
tag_name = url.split('releases/download/')[-1].split('/')[0]
|
|
81
|
+
if not tag_name:
|
|
82
|
+
raise ValueError(f'Tag not found at {url}')
|
|
83
|
+
return tag_name
|
|
84
|
+
|
|
85
|
+
def get_release_zip_url(
|
|
86
|
+
self,
|
|
87
|
+
tag: str,
|
|
88
|
+
exclude_patterns: list[str] | None = None,
|
|
89
|
+
priority_patterns: list[str] | None = None,
|
|
90
|
+
) -> str:
|
|
91
|
+
zip_assets = self.get_release_zip_assets(tag=tag)
|
|
92
|
+
zip_asset = self.get_matched_asset(
|
|
93
|
+
assets=zip_assets,
|
|
94
|
+
exclude_patterns=exclude_patterns,
|
|
95
|
+
priority_patterns=priority_patterns,
|
|
96
|
+
)
|
|
97
|
+
return zip_asset['url']
|
|
98
|
+
|
|
99
|
+
def get_release_zip_assets(self, tag: str) -> list[dict[str, str]]:
|
|
100
|
+
'''Get all links to zip archives from the specified or latest release'''
|
|
101
|
+
api_url = f'{self.releases_api_url}/tags/{tag}'
|
|
102
|
+
response = requests.get(api_url)
|
|
103
|
+
response.raise_for_status()
|
|
104
|
+
release_data = response.json()
|
|
105
|
+
zip_assets = []
|
|
106
|
+
for asset in release_data['assets']:
|
|
107
|
+
if asset['name'].endswith('.zip'):
|
|
108
|
+
zip_assets.append({
|
|
109
|
+
'name': asset['name'],
|
|
110
|
+
'tag_name': release_data['tag_name'],
|
|
111
|
+
'url': asset['browser_download_url'],
|
|
112
|
+
'size': f'{asset["size"] // 1024**2} MB',
|
|
113
|
+
})
|
|
114
|
+
return zip_assets
|
|
115
|
+
|
|
116
|
+
def get_matched_asset(
|
|
117
|
+
self,
|
|
118
|
+
assets: list[dict[str, str]],
|
|
119
|
+
exclude_patterns: list[str] | None = None,
|
|
120
|
+
priority_patterns: list[str] | None = None,
|
|
121
|
+
) -> dict[str, str]:
|
|
122
|
+
'''Selects the appropriate archive based on the current OS and architecture'''
|
|
123
|
+
os_name, arch = self.detect_system()
|
|
124
|
+
matched_assets = []
|
|
125
|
+
for asset in assets:
|
|
126
|
+
name = asset['name'].lower()
|
|
127
|
+
if os_name not in name or arch not in name:
|
|
128
|
+
continue
|
|
129
|
+
if exclude_patterns and any(p in name for p in exclude_patterns):
|
|
130
|
+
continue
|
|
131
|
+
matched_assets.append(asset)
|
|
132
|
+
if not matched_assets:
|
|
133
|
+
raise RuntimeError(f'No suitable archive found for {os_name}-{arch}')
|
|
134
|
+
if priority_patterns:
|
|
135
|
+
for pattern in priority_patterns:
|
|
136
|
+
for asset in matched_assets:
|
|
137
|
+
name = asset['name'].lower()
|
|
138
|
+
if pattern in name:
|
|
139
|
+
matched_assets = [asset]
|
|
140
|
+
break
|
|
141
|
+
if len(matched_assets) == 1:
|
|
142
|
+
break
|
|
143
|
+
if len(matched_assets) > 1:
|
|
144
|
+
logger.warning(
|
|
145
|
+
f'More than one archive match found, the first one will be selected:'
|
|
146
|
+
f'{[d.get("name") for d in matched_assets]}'
|
|
147
|
+
)
|
|
148
|
+
return matched_assets[0]
|
|
149
|
+
|
|
150
|
+
@staticmethod
|
|
151
|
+
def detect_system() -> tuple[str, str]:
|
|
152
|
+
'''Determines the current platform (OS + architecture)'''
|
|
153
|
+
os_name = platform.system().lower()
|
|
154
|
+
arch = platform.machine().lower()
|
|
155
|
+
if os_name == 'windows':
|
|
156
|
+
os_name = 'win'
|
|
157
|
+
elif os_name == 'linux':
|
|
158
|
+
os_name = 'ubuntu'
|
|
159
|
+
elif os_name == 'darwin':
|
|
160
|
+
os_name = 'macos'
|
|
161
|
+
if arch in ('x86_64', 'amd64'):
|
|
162
|
+
arch = 'x64'
|
|
163
|
+
elif arch in ('arm64', 'aarch64'):
|
|
164
|
+
arch = 'arm64'
|
|
165
|
+
return os_name, arch
|
|
166
|
+
|
|
167
|
+
@staticmethod
|
|
168
|
+
def download_file(file_url: str, file_path: str | Path) -> None:
|
|
169
|
+
response = requests.get(file_url, stream=True)
|
|
170
|
+
if response.status_code != 200:
|
|
171
|
+
raise Exception(
|
|
172
|
+
f'The file is not available for download at the link: {file_url}'
|
|
173
|
+
)
|
|
174
|
+
total_size = int(response.headers.get('content-length', 0))
|
|
175
|
+
progress_tqdm = tqdm(
|
|
176
|
+
desc=f'Downoading release: {Path(file_path).name}',
|
|
177
|
+
total=total_size,
|
|
178
|
+
unit='iB',
|
|
179
|
+
unit_scale=True,
|
|
180
|
+
)
|
|
181
|
+
with open(file_path, 'wb') as file:
|
|
182
|
+
for data in response.iter_content(chunk_size=4096):
|
|
183
|
+
size = file.write(data)
|
|
184
|
+
progress_tqdm.update(size)
|
|
185
|
+
progress_tqdm.close()
|
|
186
|
+
|
|
187
|
+
@staticmethod
|
|
188
|
+
def extract_zip(zip_path: Path, extract_dir: Path) -> None:
|
|
189
|
+
with zipfile.ZipFile(zip_path, 'r') as archive:
|
|
190
|
+
archive.extractall(path=extract_dir)
|
|
191
|
+
|
|
192
|
+
@classmethod
|
|
193
|
+
def download_and_extract_zip(
|
|
194
|
+
cls,
|
|
195
|
+
zip_url: str,
|
|
196
|
+
extract_dir: Path,
|
|
197
|
+
override: bool = False,
|
|
198
|
+
set_execute_permissions: bool = True,
|
|
199
|
+
) -> None:
|
|
200
|
+
extract_dir.mkdir(exist_ok=True, parents=True)
|
|
201
|
+
zip_path = extract_dir / Path(zip_url).name
|
|
202
|
+
logger.info(f'Loading file {zip_url} to path {zip_path}')
|
|
203
|
+
cls.download_file(file_url=zip_url, file_path=zip_path)
|
|
204
|
+
cls.extract_zip(zip_path=zip_path, extract_dir=extract_dir)
|
|
205
|
+
zip_path.unlink(missing_ok=True)
|
|
206
|
+
if set_execute_permissions and platform.system() != 'Windows':
|
|
207
|
+
for file in extract_dir.rglob('*'):
|
|
208
|
+
if file.is_file():
|
|
209
|
+
file.chmod(0o755)
|