moxing 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- moxing-0.1.0/MANIFEST.in +8 -0
- moxing-0.1.0/PKG-INFO +233 -0
- moxing-0.1.0/README.md +186 -0
- moxing-0.1.0/moxing.egg-info/PKG-INFO +233 -0
- moxing-0.1.0/moxing.egg-info/SOURCES.txt +21 -0
- moxing-0.1.0/moxing.egg-info/dependency_links.txt +1 -0
- moxing-0.1.0/moxing.egg-info/entry_points.txt +2 -0
- moxing-0.1.0/moxing.egg-info/requires.txt +25 -0
- moxing-0.1.0/moxing.egg-info/top_level.txt +1 -0
- moxing-0.1.0/pyllm/__init__.py +80 -0
- moxing-0.1.0/pyllm/benchmark.py +409 -0
- moxing-0.1.0/pyllm/bin/darwin/.gitkeep +0 -0
- moxing-0.1.0/pyllm/bin/linux/.gitkeep +0 -0
- moxing-0.1.0/pyllm/bin/windows/.gitkeep +0 -0
- moxing-0.1.0/pyllm/binaries.py +335 -0
- moxing-0.1.0/pyllm/cli.py +698 -0
- moxing-0.1.0/pyllm/client.py +174 -0
- moxing-0.1.0/pyllm/device.py +327 -0
- moxing-0.1.0/pyllm/models.py +549 -0
- moxing-0.1.0/pyllm/runner.py +316 -0
- moxing-0.1.0/pyllm/server.py +266 -0
- moxing-0.1.0/pyproject.toml +83 -0
- moxing-0.1.0/setup.cfg +4 -0
moxing-0.1.0/MANIFEST.in
ADDED
moxing-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: moxing
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python wrapper for llama.cpp - OpenAI API compatible LLM backend with auto GPU detection
|
|
5
|
+
Author: llama.cpp community
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ggml-org/llama.cpp
|
|
8
|
+
Project-URL: Documentation, https://github.com/ggml-org/llama.cpp/tree/master/tools/server
|
|
9
|
+
Project-URL: Repository, https://github.com/ggml-org/llama.cpp
|
|
10
|
+
Project-URL: Issues, https://github.com/ggml-org/llama.cpp/issues
|
|
11
|
+
Keywords: llama,llama.cpp,gguf,openai,api,gpu,vulkan,cuda,ai,llm
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
25
|
+
Requires-Python: >=3.8
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
Requires-Dist: httpx>=0.24.0
|
|
28
|
+
Requires-Dist: pydantic>=2.0.0
|
|
29
|
+
Requires-Dist: rich>=13.0.0
|
|
30
|
+
Requires-Dist: typer>=0.9.0
|
|
31
|
+
Requires-Dist: psutil>=5.9.0
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
35
|
+
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
36
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
37
|
+
Provides-Extra: openai
|
|
38
|
+
Requires-Dist: openai>=1.0.0; extra == "openai"
|
|
39
|
+
Provides-Extra: hf
|
|
40
|
+
Requires-Dist: huggingface_hub>=0.20.0; extra == "hf"
|
|
41
|
+
Provides-Extra: modelscope
|
|
42
|
+
Requires-Dist: modelscope>=1.10.0; extra == "modelscope"
|
|
43
|
+
Provides-Extra: all
|
|
44
|
+
Requires-Dist: openai>=1.0.0; extra == "all"
|
|
45
|
+
Requires-Dist: huggingface_hub>=0.20.0; extra == "all"
|
|
46
|
+
Requires-Dist: modelscope>=1.10.0; extra == "all"
|
|
47
|
+
|
|
48
|
+
# moxing (模型)
|
|
49
|
+
|
|
50
|
+
Python wrapper for llama.cpp - OpenAI API compatible LLM backend with automatic GPU detection and model downloading.
|
|
51
|
+
|
|
52
|
+
**moxing** (模型) means "model" in Chinese. It provides a simple, unified interface for running LLMs locally.
|
|
53
|
+
|
|
54
|
+
## Features
|
|
55
|
+
|
|
56
|
+
- **Auto GPU Detection**: Automatically detects and configures the best GPU backend (Vulkan, CUDA, ROCm, Metal)
|
|
57
|
+
- **Model Downloading**: Download GGUF models from HuggingFace and ModelScope
|
|
58
|
+
- **OpenAI API Compatible**: Drop-in replacement for OpenAI API
|
|
59
|
+
- **Function Calling**: Support for tools and function calling
|
|
60
|
+
- **Pre-built Binaries**: Automatically downloads pre-built llama.cpp binaries
|
|
61
|
+
- **Benchmark**: Measure tokens-per-second throughput, similar to Ollama
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install moxing  # provides the `pyllm` Python package and the `pyllm` command used below
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Quick Start
|
|
70
|
+
|
|
71
|
+
### Command Line
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# List available GPUs
|
|
75
|
+
pyllm devices
|
|
76
|
+
|
|
77
|
+
# Run inference with a model
|
|
78
|
+
pyllm run ./model.gguf -p "Hello, world!"
|
|
79
|
+
|
|
80
|
+
# Quick speed test
|
|
81
|
+
pyllm speed ./model.gguf
|
|
82
|
+
|
|
83
|
+
# Benchmark performance
|
|
84
|
+
pyllm bench ./model.gguf
|
|
85
|
+
|
|
86
|
+
# Download a model
|
|
87
|
+
pyllm download llama-3.2-3b -q Q4_K_M
|
|
88
|
+
|
|
89
|
+
# Start an OpenAI-compatible server
|
|
90
|
+
pyllm serve llama-3.2-3b -p 8080
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Python API
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from pyllm import quick_run, quick_server, Client
|
|
97
|
+
|
|
98
|
+
# Quick inference
|
|
99
|
+
result = quick_run("llama-3.2-3b", "Write a haiku about coding")
|
|
100
|
+
print(result)
|
|
101
|
+
|
|
102
|
+
# Start server with auto-configuration
|
|
103
|
+
with quick_server("llama-3.2-3b") as server:
|
|
104
|
+
    client = Client(server.base_url)
|
|
105
|
+
|
|
106
|
+
    response = client.chat.completions.create(
|
|
107
|
+
        model="llama",
|
|
108
|
+
        messages=[{"role": "user", "content": "Hello!"}]
|
|
109
|
+
    )
|
|
110
|
+
    print(response.choices[0]["message"]["content"])
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Auto GPU Detection
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from pyllm import DeviceDetector, AutoRunner
|
|
117
|
+
|
|
118
|
+
# Detect available GPUs
|
|
119
|
+
detector = DeviceDetector()
|
|
120
|
+
devices = detector.detect()
|
|
121
|
+
for device in devices:
|
|
122
|
+
    print(f"{device.name} ({device.backend.value}, {device.memory_gb:.1f}GB)")
|
|
123
|
+
|
|
124
|
+
# Get optimal configuration
|
|
125
|
+
config = detector.get_best_device(model_size_gb=5.0)
|
|
126
|
+
print(f"Best device: {config.device.name}")
|
|
127
|
+
print(f"Recommended GPU layers: {config.n_gpu_layers}")
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Model Download
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from pyllm import ModelDownloader
|
|
134
|
+
|
|
135
|
+
downloader = ModelDownloader()
|
|
136
|
+
|
|
137
|
+
# Download from HuggingFace
|
|
138
|
+
path = downloader.download("Qwen/Qwen2.5-7B-Instruct-GGUF", "Q4_K_M.gguf")
|
|
139
|
+
|
|
140
|
+
# Download from ModelScope
|
|
141
|
+
path = downloader.download(
|
|
142
|
+
"LLM-Research/Meta-Llama-3-8B-Instruct-GGUF",
|
|
143
|
+
"Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
|
|
144
|
+
source="modelscope"
|
|
145
|
+
)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## CLI Commands
|
|
149
|
+
|
|
150
|
+
| Command | Description |
|
|
151
|
+
|---------|-------------|
|
|
152
|
+
| `pyllm serve` | Start OpenAI-compatible server |
|
|
153
|
+
| `pyllm run` | Run inference with a model |
|
|
154
|
+
| `pyllm chat` | Interactive chat with a model |
|
|
155
|
+
| `pyllm bench` | Benchmark model performance |
|
|
156
|
+
| `pyllm speed` | Quick speed test |
|
|
157
|
+
| `pyllm info` | Show model info and estimates |
|
|
158
|
+
| `pyllm download` | Download a model |
|
|
159
|
+
| `pyllm models` | List available models |
|
|
160
|
+
| `pyllm devices` | List GPU devices |
|
|
161
|
+
| `pyllm config` | Show optimal configuration |
|
|
162
|
+
| `pyllm diagnose` | Diagnose system setup |
|
|
163
|
+
|
|
164
|
+
## Popular Models
|
|
165
|
+
|
|
166
|
+
| Name | Description | Sizes |
|
|
167
|
+
|------|-------------|-------|
|
|
168
|
+
| llama-3.2-3b | Llama 3.2 3B | Q4_K_M, Q5_K_M, Q8_0 |
|
|
169
|
+
| llama-3.1-8b | Llama 3.1 8B | Q4_K_M, Q5_K_M, Q8_0 |
|
|
170
|
+
| qwen2.5-7b | Qwen 2.5 7B | Q4_K_M, Q5_K_M, Q8_0 |
|
|
171
|
+
| gemma-2-9b | Gemma 2 9B | Q4_K_M, Q5_K_M, Q8_0 |
|
|
172
|
+
| mistral-7b | Mistral 7B v0.3 | Q4_K_M, Q5_K_M, Q8_0 |
|
|
173
|
+
| phi-3.5-mini | Phi 3.5 Mini | Q4_K_M, Q5_K_M, Q8_0 |
|
|
174
|
+
| deepseek-coder-6.7b | DeepSeek Coder | Q4_K_M, Q5_K_M, Q8_0 |
|
|
175
|
+
|
|
176
|
+
## GPU Backends
|
|
177
|
+
|
|
178
|
+
| Backend | Platforms | Description |
|
|
179
|
+
|---------|-----------|-------------|
|
|
180
|
+
| Vulkan | Windows, Linux | Cross-platform GPU API, works on AMD, Intel, NVIDIA |
|
|
181
|
+
| CUDA | Windows, Linux | NVIDIA GPUs |
|
|
182
|
+
| ROCm | Linux | AMD GPUs |
|
|
183
|
+
| Metal | macOS | Apple Silicon |
|
|
184
|
+
| CPU | All | Fallback, no GPU required |
|
|
185
|
+
|
|
186
|
+
## Function Calling
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from pyllm import Client, LlamaServer
|
|
190
|
+
|
|
191
|
+
tools = [{
|
|
192
|
+
"type": "function",
|
|
193
|
+
"function": {
|
|
194
|
+
"name": "get_weather",
|
|
195
|
+
"description": "Get weather for a location",
|
|
196
|
+
"parameters": {
|
|
197
|
+
"type": "object",
|
|
198
|
+
"properties": {
|
|
199
|
+
"location": {"type": "string"}
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
}]
|
|
204
|
+
|
|
205
|
+
with LlamaServer("model.gguf") as server:
|
|
206
|
+
    client = Client(server.base_url)
|
|
207
|
+
|
|
208
|
+
    response = client.chat.completions.create(
|
|
209
|
+
        model="llama",
|
|
210
|
+
        messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
|
|
211
|
+
        tools=tools
|
|
212
|
+
    )
|
|
213
|
+
|
|
214
|
+
    if response.choices[0]["message"].get("tool_calls"):
|
|
215
|
+
        print("Model wants to call:", response.choices[0]["message"]["tool_calls"])
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## Requirements
|
|
219
|
+
|
|
220
|
+
- Python 3.8+
|
|
221
|
+
- Vulkan SDK (for Vulkan backend)
|
|
222
|
+
- CUDA Toolkit (for CUDA backend)
|
|
223
|
+
- ROCm (for ROCm backend)
|
|
224
|
+
|
|
225
|
+
## License
|
|
226
|
+
|
|
227
|
+
MIT License - same as llama.cpp
|
|
228
|
+
|
|
229
|
+
## Links
|
|
230
|
+
|
|
231
|
+
- [llama.cpp](https://github.com/ggml-org/llama.cpp)
|
|
232
|
+
- [Documentation](https://github.com/ggml-org/llama.cpp/tree/master/tools/server)
|
|
233
|
+
- [Issues](https://github.com/ggml-org/llama.cpp/issues)
|
moxing-0.1.0/README.md
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# moxing (模型)
|
|
2
|
+
|
|
3
|
+
Python wrapper for llama.cpp - OpenAI API compatible LLM backend with automatic GPU detection and model downloading.
|
|
4
|
+
|
|
5
|
+
**moxing** (模型) means "model" in Chinese. It provides a simple, unified interface for running LLMs locally.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Auto GPU Detection**: Automatically detects and configures the best GPU backend (Vulkan, CUDA, ROCm, Metal)
|
|
10
|
+
- **Model Downloading**: Download GGUF models from HuggingFace and ModelScope
|
|
11
|
+
- **OpenAI API Compatible**: Drop-in replacement for OpenAI API
|
|
12
|
+
- **Function Calling**: Support for tools and function calling
|
|
13
|
+
- **Pre-built Binaries**: Automatically downloads pre-built llama.cpp binaries
|
|
14
|
+
- **Benchmark**: Measure tokens-per-second throughput, similar to Ollama
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install moxing  # provides the `pyllm` Python package and the `pyllm` command used below
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
### Command Line
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# List available GPUs
|
|
28
|
+
pyllm devices
|
|
29
|
+
|
|
30
|
+
# Run inference with a model
|
|
31
|
+
pyllm run ./model.gguf -p "Hello, world!"
|
|
32
|
+
|
|
33
|
+
# Quick speed test
|
|
34
|
+
pyllm speed ./model.gguf
|
|
35
|
+
|
|
36
|
+
# Benchmark performance
|
|
37
|
+
pyllm bench ./model.gguf
|
|
38
|
+
|
|
39
|
+
# Download a model
|
|
40
|
+
pyllm download llama-3.2-3b -q Q4_K_M
|
|
41
|
+
|
|
42
|
+
# Start an OpenAI-compatible server
|
|
43
|
+
pyllm serve llama-3.2-3b -p 8080
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Python API
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from pyllm import quick_run, quick_server, Client
|
|
50
|
+
|
|
51
|
+
# Quick inference
|
|
52
|
+
result = quick_run("llama-3.2-3b", "Write a haiku about coding")
|
|
53
|
+
print(result)
|
|
54
|
+
|
|
55
|
+
# Start server with auto-configuration
|
|
56
|
+
with quick_server("llama-3.2-3b") as server:
|
|
57
|
+
    client = Client(server.base_url)
|
|
58
|
+
|
|
59
|
+
    response = client.chat.completions.create(
|
|
60
|
+
        model="llama",
|
|
61
|
+
        messages=[{"role": "user", "content": "Hello!"}]
|
|
62
|
+
    )
|
|
63
|
+
    print(response.choices[0]["message"]["content"])
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Auto GPU Detection
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from pyllm import DeviceDetector, AutoRunner
|
|
70
|
+
|
|
71
|
+
# Detect available GPUs
|
|
72
|
+
detector = DeviceDetector()
|
|
73
|
+
devices = detector.detect()
|
|
74
|
+
for device in devices:
|
|
75
|
+
    print(f"{device.name} ({device.backend.value}, {device.memory_gb:.1f}GB)")
|
|
76
|
+
|
|
77
|
+
# Get optimal configuration
|
|
78
|
+
config = detector.get_best_device(model_size_gb=5.0)
|
|
79
|
+
print(f"Best device: {config.device.name}")
|
|
80
|
+
print(f"Recommended GPU layers: {config.n_gpu_layers}")
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Model Download
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from pyllm import ModelDownloader
|
|
87
|
+
|
|
88
|
+
downloader = ModelDownloader()
|
|
89
|
+
|
|
90
|
+
# Download from HuggingFace
|
|
91
|
+
path = downloader.download("Qwen/Qwen2.5-7B-Instruct-GGUF", "Q4_K_M.gguf")
|
|
92
|
+
|
|
93
|
+
# Download from ModelScope
|
|
94
|
+
path = downloader.download(
|
|
95
|
+
"LLM-Research/Meta-Llama-3-8B-Instruct-GGUF",
|
|
96
|
+
"Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
|
|
97
|
+
source="modelscope"
|
|
98
|
+
)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## CLI Commands
|
|
102
|
+
|
|
103
|
+
| Command | Description |
|
|
104
|
+
|---------|-------------|
|
|
105
|
+
| `pyllm serve` | Start OpenAI-compatible server |
|
|
106
|
+
| `pyllm run` | Run inference with a model |
|
|
107
|
+
| `pyllm chat` | Interactive chat with a model |
|
|
108
|
+
| `pyllm bench` | Benchmark model performance |
|
|
109
|
+
| `pyllm speed` | Quick speed test |
|
|
110
|
+
| `pyllm info` | Show model info and estimates |
|
|
111
|
+
| `pyllm download` | Download a model |
|
|
112
|
+
| `pyllm models` | List available models |
|
|
113
|
+
| `pyllm devices` | List GPU devices |
|
|
114
|
+
| `pyllm config` | Show optimal configuration |
|
|
115
|
+
| `pyllm diagnose` | Diagnose system setup |
|
|
116
|
+
|
|
117
|
+
## Popular Models
|
|
118
|
+
|
|
119
|
+
| Name | Description | Sizes |
|
|
120
|
+
|------|-------------|-------|
|
|
121
|
+
| llama-3.2-3b | Llama 3.2 3B | Q4_K_M, Q5_K_M, Q8_0 |
|
|
122
|
+
| llama-3.1-8b | Llama 3.1 8B | Q4_K_M, Q5_K_M, Q8_0 |
|
|
123
|
+
| qwen2.5-7b | Qwen 2.5 7B | Q4_K_M, Q5_K_M, Q8_0 |
|
|
124
|
+
| gemma-2-9b | Gemma 2 9B | Q4_K_M, Q5_K_M, Q8_0 |
|
|
125
|
+
| mistral-7b | Mistral 7B v0.3 | Q4_K_M, Q5_K_M, Q8_0 |
|
|
126
|
+
| phi-3.5-mini | Phi 3.5 Mini | Q4_K_M, Q5_K_M, Q8_0 |
|
|
127
|
+
| deepseek-coder-6.7b | DeepSeek Coder | Q4_K_M, Q5_K_M, Q8_0 |
|
|
128
|
+
|
|
129
|
+
## GPU Backends
|
|
130
|
+
|
|
131
|
+
| Backend | Platforms | Description |
|
|
132
|
+
|---------|-----------|-------------|
|
|
133
|
+
| Vulkan | Windows, Linux | Cross-platform GPU API, works on AMD, Intel, NVIDIA |
|
|
134
|
+
| CUDA | Windows, Linux | NVIDIA GPUs |
|
|
135
|
+
| ROCm | Linux | AMD GPUs |
|
|
136
|
+
| Metal | macOS | Apple Silicon |
|
|
137
|
+
| CPU | All | Fallback, no GPU required |
|
|
138
|
+
|
|
139
|
+
## Function Calling
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from pyllm import Client, LlamaServer
|
|
143
|
+
|
|
144
|
+
tools = [{
|
|
145
|
+
"type": "function",
|
|
146
|
+
"function": {
|
|
147
|
+
"name": "get_weather",
|
|
148
|
+
"description": "Get weather for a location",
|
|
149
|
+
"parameters": {
|
|
150
|
+
"type": "object",
|
|
151
|
+
"properties": {
|
|
152
|
+
"location": {"type": "string"}
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
}]
|
|
157
|
+
|
|
158
|
+
with LlamaServer("model.gguf") as server:
|
|
159
|
+
    client = Client(server.base_url)
|
|
160
|
+
|
|
161
|
+
    response = client.chat.completions.create(
|
|
162
|
+
        model="llama",
|
|
163
|
+
        messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
|
|
164
|
+
        tools=tools
|
|
165
|
+
    )
|
|
166
|
+
|
|
167
|
+
    if response.choices[0]["message"].get("tool_calls"):
|
|
168
|
+
        print("Model wants to call:", response.choices[0]["message"]["tool_calls"])
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Requirements
|
|
172
|
+
|
|
173
|
+
- Python 3.8+
|
|
174
|
+
- Vulkan SDK (for Vulkan backend)
|
|
175
|
+
- CUDA Toolkit (for CUDA backend)
|
|
176
|
+
- ROCm (for ROCm backend)
|
|
177
|
+
|
|
178
|
+
## License
|
|
179
|
+
|
|
180
|
+
MIT License - same as llama.cpp
|
|
181
|
+
|
|
182
|
+
## Links
|
|
183
|
+
|
|
184
|
+
- [llama.cpp](https://github.com/ggml-org/llama.cpp)
|
|
185
|
+
- [Documentation](https://github.com/ggml-org/llama.cpp/tree/master/tools/server)
|
|
186
|
+
- [Issues](https://github.com/ggml-org/llama.cpp/issues)
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: moxing
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python wrapper for llama.cpp - OpenAI API compatible LLM backend with auto GPU detection
|
|
5
|
+
Author: llama.cpp community
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ggml-org/llama.cpp
|
|
8
|
+
Project-URL: Documentation, https://github.com/ggml-org/llama.cpp/tree/master/tools/server
|
|
9
|
+
Project-URL: Repository, https://github.com/ggml-org/llama.cpp
|
|
10
|
+
Project-URL: Issues, https://github.com/ggml-org/llama.cpp/issues
|
|
11
|
+
Keywords: llama,llama.cpp,gguf,openai,api,gpu,vulkan,cuda,ai,llm
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
25
|
+
Requires-Python: >=3.8
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
Requires-Dist: httpx>=0.24.0
|
|
28
|
+
Requires-Dist: pydantic>=2.0.0
|
|
29
|
+
Requires-Dist: rich>=13.0.0
|
|
30
|
+
Requires-Dist: typer>=0.9.0
|
|
31
|
+
Requires-Dist: psutil>=5.9.0
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
35
|
+
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
36
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
37
|
+
Provides-Extra: openai
|
|
38
|
+
Requires-Dist: openai>=1.0.0; extra == "openai"
|
|
39
|
+
Provides-Extra: hf
|
|
40
|
+
Requires-Dist: huggingface_hub>=0.20.0; extra == "hf"
|
|
41
|
+
Provides-Extra: modelscope
|
|
42
|
+
Requires-Dist: modelscope>=1.10.0; extra == "modelscope"
|
|
43
|
+
Provides-Extra: all
|
|
44
|
+
Requires-Dist: openai>=1.0.0; extra == "all"
|
|
45
|
+
Requires-Dist: huggingface_hub>=0.20.0; extra == "all"
|
|
46
|
+
Requires-Dist: modelscope>=1.10.0; extra == "all"
|
|
47
|
+
|
|
48
|
+
# moxing (模型)
|
|
49
|
+
|
|
50
|
+
Python wrapper for llama.cpp - OpenAI API compatible LLM backend with automatic GPU detection and model downloading.
|
|
51
|
+
|
|
52
|
+
**moxing** (模型) means "model" in Chinese. It provides a simple, unified interface for running LLMs locally.
|
|
53
|
+
|
|
54
|
+
## Features
|
|
55
|
+
|
|
56
|
+
- **Auto GPU Detection**: Automatically detects and configures the best GPU backend (Vulkan, CUDA, ROCm, Metal)
|
|
57
|
+
- **Model Downloading**: Download GGUF models from HuggingFace and ModelScope
|
|
58
|
+
- **OpenAI API Compatible**: Drop-in replacement for OpenAI API
|
|
59
|
+
- **Function Calling**: Support for tools and function calling
|
|
60
|
+
- **Pre-built Binaries**: Automatically downloads pre-built llama.cpp binaries
|
|
61
|
+
- **Benchmark**: Measure tokens-per-second throughput, similar to Ollama
|
|
62
|
+
|
|
63
|
+
## Installation
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
pip install moxing  # provides the `pyllm` Python package and the `pyllm` command used below
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Quick Start
|
|
70
|
+
|
|
71
|
+
### Command Line
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# List available GPUs
|
|
75
|
+
pyllm devices
|
|
76
|
+
|
|
77
|
+
# Run inference with a model
|
|
78
|
+
pyllm run ./model.gguf -p "Hello, world!"
|
|
79
|
+
|
|
80
|
+
# Quick speed test
|
|
81
|
+
pyllm speed ./model.gguf
|
|
82
|
+
|
|
83
|
+
# Benchmark performance
|
|
84
|
+
pyllm bench ./model.gguf
|
|
85
|
+
|
|
86
|
+
# Download a model
|
|
87
|
+
pyllm download llama-3.2-3b -q Q4_K_M
|
|
88
|
+
|
|
89
|
+
# Start an OpenAI-compatible server
|
|
90
|
+
pyllm serve llama-3.2-3b -p 8080
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Python API
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from pyllm import quick_run, quick_server, Client
|
|
97
|
+
|
|
98
|
+
# Quick inference
|
|
99
|
+
result = quick_run("llama-3.2-3b", "Write a haiku about coding")
|
|
100
|
+
print(result)
|
|
101
|
+
|
|
102
|
+
# Start server with auto-configuration
|
|
103
|
+
with quick_server("llama-3.2-3b") as server:
|
|
104
|
+
    client = Client(server.base_url)
|
|
105
|
+
|
|
106
|
+
    response = client.chat.completions.create(
|
|
107
|
+
        model="llama",
|
|
108
|
+
        messages=[{"role": "user", "content": "Hello!"}]
|
|
109
|
+
    )
|
|
110
|
+
    print(response.choices[0]["message"]["content"])
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### Auto GPU Detection
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from pyllm import DeviceDetector, AutoRunner
|
|
117
|
+
|
|
118
|
+
# Detect available GPUs
|
|
119
|
+
detector = DeviceDetector()
|
|
120
|
+
devices = detector.detect()
|
|
121
|
+
for device in devices:
|
|
122
|
+
    print(f"{device.name} ({device.backend.value}, {device.memory_gb:.1f}GB)")
|
|
123
|
+
|
|
124
|
+
# Get optimal configuration
|
|
125
|
+
config = detector.get_best_device(model_size_gb=5.0)
|
|
126
|
+
print(f"Best device: {config.device.name}")
|
|
127
|
+
print(f"Recommended GPU layers: {config.n_gpu_layers}")
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Model Download
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
from pyllm import ModelDownloader
|
|
134
|
+
|
|
135
|
+
downloader = ModelDownloader()
|
|
136
|
+
|
|
137
|
+
# Download from HuggingFace
|
|
138
|
+
path = downloader.download("Qwen/Qwen2.5-7B-Instruct-GGUF", "Q4_K_M.gguf")
|
|
139
|
+
|
|
140
|
+
# Download from ModelScope
|
|
141
|
+
path = downloader.download(
|
|
142
|
+
"LLM-Research/Meta-Llama-3-8B-Instruct-GGUF",
|
|
143
|
+
"Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
|
|
144
|
+
source="modelscope"
|
|
145
|
+
)
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
## CLI Commands
|
|
149
|
+
|
|
150
|
+
| Command | Description |
|
|
151
|
+
|---------|-------------|
|
|
152
|
+
| `pyllm serve` | Start OpenAI-compatible server |
|
|
153
|
+
| `pyllm run` | Run inference with a model |
|
|
154
|
+
| `pyllm chat` | Interactive chat with a model |
|
|
155
|
+
| `pyllm bench` | Benchmark model performance |
|
|
156
|
+
| `pyllm speed` | Quick speed test |
|
|
157
|
+
| `pyllm info` | Show model info and estimates |
|
|
158
|
+
| `pyllm download` | Download a model |
|
|
159
|
+
| `pyllm models` | List available models |
|
|
160
|
+
| `pyllm devices` | List GPU devices |
|
|
161
|
+
| `pyllm config` | Show optimal configuration |
|
|
162
|
+
| `pyllm diagnose` | Diagnose system setup |
|
|
163
|
+
|
|
164
|
+
## Popular Models
|
|
165
|
+
|
|
166
|
+
| Name | Description | Sizes |
|
|
167
|
+
|------|-------------|-------|
|
|
168
|
+
| llama-3.2-3b | Llama 3.2 3B | Q4_K_M, Q5_K_M, Q8_0 |
|
|
169
|
+
| llama-3.1-8b | Llama 3.1 8B | Q4_K_M, Q5_K_M, Q8_0 |
|
|
170
|
+
| qwen2.5-7b | Qwen 2.5 7B | Q4_K_M, Q5_K_M, Q8_0 |
|
|
171
|
+
| gemma-2-9b | Gemma 2 9B | Q4_K_M, Q5_K_M, Q8_0 |
|
|
172
|
+
| mistral-7b | Mistral 7B v0.3 | Q4_K_M, Q5_K_M, Q8_0 |
|
|
173
|
+
| phi-3.5-mini | Phi 3.5 Mini | Q4_K_M, Q5_K_M, Q8_0 |
|
|
174
|
+
| deepseek-coder-6.7b | DeepSeek Coder | Q4_K_M, Q5_K_M, Q8_0 |
|
|
175
|
+
|
|
176
|
+
## GPU Backends
|
|
177
|
+
|
|
178
|
+
| Backend | Platforms | Description |
|
|
179
|
+
|---------|-----------|-------------|
|
|
180
|
+
| Vulkan | Windows, Linux | Cross-platform GPU API, works on AMD, Intel, NVIDIA |
|
|
181
|
+
| CUDA | Windows, Linux | NVIDIA GPUs |
|
|
182
|
+
| ROCm | Linux | AMD GPUs |
|
|
183
|
+
| Metal | macOS | Apple Silicon |
|
|
184
|
+
| CPU | All | Fallback, no GPU required |
|
|
185
|
+
|
|
186
|
+
## Function Calling
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
from pyllm import Client, LlamaServer
|
|
190
|
+
|
|
191
|
+
tools = [{
|
|
192
|
+
"type": "function",
|
|
193
|
+
"function": {
|
|
194
|
+
"name": "get_weather",
|
|
195
|
+
"description": "Get weather for a location",
|
|
196
|
+
"parameters": {
|
|
197
|
+
"type": "object",
|
|
198
|
+
"properties": {
|
|
199
|
+
"location": {"type": "string"}
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
}
|
|
203
|
+
}]
|
|
204
|
+
|
|
205
|
+
with LlamaServer("model.gguf") as server:
|
|
206
|
+
    client = Client(server.base_url)
|
|
207
|
+
|
|
208
|
+
    response = client.chat.completions.create(
|
|
209
|
+
        model="llama",
|
|
210
|
+
        messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
|
|
211
|
+
        tools=tools
|
|
212
|
+
    )
|
|
213
|
+
|
|
214
|
+
    if response.choices[0]["message"].get("tool_calls"):
|
|
215
|
+
        print("Model wants to call:", response.choices[0]["message"]["tool_calls"])
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## Requirements
|
|
219
|
+
|
|
220
|
+
- Python 3.8+
|
|
221
|
+
- Vulkan SDK (for Vulkan backend)
|
|
222
|
+
- CUDA Toolkit (for CUDA backend)
|
|
223
|
+
- ROCm (for ROCm backend)
|
|
224
|
+
|
|
225
|
+
## License
|
|
226
|
+
|
|
227
|
+
MIT License - same as llama.cpp
|
|
228
|
+
|
|
229
|
+
## Links
|
|
230
|
+
|
|
231
|
+
- [llama.cpp](https://github.com/ggml-org/llama.cpp)
|
|
232
|
+
- [Documentation](https://github.com/ggml-org/llama.cpp/tree/master/tools/server)
|
|
233
|
+
- [Issues](https://github.com/ggml-org/llama.cpp/issues)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MANIFEST.in
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
moxing.egg-info/PKG-INFO
|
|
5
|
+
moxing.egg-info/SOURCES.txt
|
|
6
|
+
moxing.egg-info/dependency_links.txt
|
|
7
|
+
moxing.egg-info/entry_points.txt
|
|
8
|
+
moxing.egg-info/requires.txt
|
|
9
|
+
moxing.egg-info/top_level.txt
|
|
10
|
+
pyllm/__init__.py
|
|
11
|
+
pyllm/benchmark.py
|
|
12
|
+
pyllm/binaries.py
|
|
13
|
+
pyllm/cli.py
|
|
14
|
+
pyllm/client.py
|
|
15
|
+
pyllm/device.py
|
|
16
|
+
pyllm/models.py
|
|
17
|
+
pyllm/runner.py
|
|
18
|
+
pyllm/server.py
|
|
19
|
+
pyllm/bin/darwin/.gitkeep
|
|
20
|
+
pyllm/bin/linux/.gitkeep
|
|
21
|
+
pyllm/bin/windows/.gitkeep
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|