moxing 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,233 @@
1
+ Metadata-Version: 2.4
2
+ Name: moxing
3
+ Version: 0.1.0
4
+ Summary: Python wrapper for llama.cpp - OpenAI API compatible LLM backend with auto GPU detection
5
+ Author: llama.cpp community
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ggml-org/llama.cpp
8
+ Project-URL: Documentation, https://github.com/ggml-org/llama.cpp/tree/master/tools/server
9
+ Project-URL: Repository, https://github.com/ggml-org/llama.cpp
10
+ Project-URL: Issues, https://github.com/ggml-org/llama.cpp/issues
11
+ Keywords: llama,llama.cpp,gguf,openai,api,gpu,vulkan,cuda,ai,llm
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Requires-Python: >=3.8
26
+ Description-Content-Type: text/markdown
27
+ Requires-Dist: httpx>=0.24.0
28
+ Requires-Dist: pydantic>=2.0.0
29
+ Requires-Dist: rich>=13.0.0
30
+ Requires-Dist: typer>=0.9.0
31
+ Requires-Dist: psutil>=5.9.0
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
34
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
35
+ Requires-Dist: build>=1.0.0; extra == "dev"
36
+ Requires-Dist: twine>=4.0.0; extra == "dev"
37
+ Provides-Extra: openai
38
+ Requires-Dist: openai>=1.0.0; extra == "openai"
39
+ Provides-Extra: hf
40
+ Requires-Dist: huggingface_hub>=0.20.0; extra == "hf"
41
+ Provides-Extra: modelscope
42
+ Requires-Dist: modelscope>=1.10.0; extra == "modelscope"
43
+ Provides-Extra: all
44
+ Requires-Dist: openai>=1.0.0; extra == "all"
45
+ Requires-Dist: huggingface_hub>=0.20.0; extra == "all"
46
+ Requires-Dist: modelscope>=1.10.0; extra == "all"
47
+
48
+ # moxing (模型)
49
+
50
+ Python wrapper for llama.cpp - OpenAI API compatible LLM backend with automatic GPU detection and model downloading.
51
+
52
+ **moxing** (模型) means "model" in Chinese. A simple, unified interface for running LLMs locally.
53
+
54
+ ## Features
55
+
56
+ - **Auto GPU Detection**: Automatically detects and configures the best GPU backend (Vulkan, CUDA, ROCm, Metal)
57
+ - **Model Downloading**: Download GGUF models from HuggingFace and ModelScope
58
+ - **OpenAI API Compatible**: Drop-in replacement for OpenAI API
59
+ - **Function Calling**: Support for tools and function calling
60
+ - **Pre-built Binaries**: Automatically downloads pre-built llama.cpp binaries
61
+ - **Benchmark**: Measure tokens-per-second performance, similar to Ollama
62
+
63
+ ## Installation
64
+
65
+ ```bash
66
+ pip install moxing
67
+ ```
68
+
69
+ ## Quick Start
70
+
71
+ ### Command Line
72
+
73
+ ```bash
74
+ # List available GPUs
75
+ moxing devices
76
+
77
+ # Run inference with a model
78
+ moxing run ./model.gguf -p "Hello, world!"
79
+
80
+ # Quick speed test
81
+ moxing speed ./model.gguf
82
+
83
+ # Benchmark performance
84
+ moxing bench ./model.gguf
85
+
86
+ # Download a model
87
+ moxing download llama-3.2-3b -q Q4_K_M
88
+
89
+ # Start an OpenAI-compatible server
90
+ moxing serve llama-3.2-3b -p 8080
91
+ ```
92
+
93
+ ### Python API
94
+
95
+ ```python
96
+ from pyllm import quick_run, quick_server, Client
97
+
98
+ # Quick inference
99
+ result = quick_run("llama-3.2-3b", "Write a haiku about coding")
100
+ print(result)
101
+
102
+ # Start server with auto-configuration
103
+ with quick_server("llama-3.2-3b") as server:
104
+     client = Client(server.base_url)
105
+
106
+     response = client.chat.completions.create(
107
+         model="llama",
108
+         messages=[{"role": "user", "content": "Hello!"}]
109
+     )
110
+     print(response.choices[0]["message"]["content"])
111
+ ```
112
+
113
+ ### Auto GPU Detection
114
+
115
+ ```python
116
+ from pyllm import DeviceDetector, AutoRunner
117
+
118
+ # Detect available GPUs
119
+ detector = DeviceDetector()
120
+ devices = detector.detect()
121
+ for device in devices:
122
+     print(f"{device.name} ({device.backend.value}, {device.memory_gb:.1f}GB)")
123
+
124
+ # Get optimal configuration
125
+ config = detector.get_best_device(model_size_gb=5.0)
126
+ print(f"Best device: {config.device.name}")
127
+ print(f"Recommended GPU layers: {config.n_gpu_layers}")
128
+ ```
129
+
130
+ ### Model Download
131
+
132
+ ```python
133
+ from pyllm import ModelDownloader
134
+
135
+ downloader = ModelDownloader()
136
+
137
+ # Download from HuggingFace
138
+ path = downloader.download("Qwen/Qwen2.5-7B-Instruct-GGUF", "Q4_K_M.gguf")
139
+
140
+ # Download from ModelScope
141
+ path = downloader.download(
142
+ "LLM-Research/Meta-Llama-3-8B-Instruct-GGUF",
143
+ "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
144
+ source="modelscope"
145
+ )
146
+ ```
147
+
148
+ ## CLI Commands
149
+
150
+ | Command | Description |
151
+ |---------|-------------|
152
+ | `moxing serve` | Start OpenAI-compatible server |
153
+ | `moxing run` | Run inference with a model |
154
+ | `moxing chat` | Interactive chat with a model |
155
+ | `moxing bench` | Benchmark model performance |
156
+ | `moxing speed` | Quick speed test |
157
+ | `moxing info` | Show model info and estimates |
158
+ | `moxing download` | Download a model |
159
+ | `moxing models` | List available models |
160
+ | `moxing devices` | List GPU devices |
161
+ | `moxing config` | Show optimal configuration |
162
+ | `moxing diagnose` | Diagnose system setup |
163
+
164
+ ## Popular Models
165
+
166
+ | Name | Description | Sizes |
167
+ |------|-------------|-------|
168
+ | llama-3.2-3b | Llama 3.2 3B | Q4_K_M, Q5_K_M, Q8_0 |
169
+ | llama-3.1-8b | Llama 3.1 8B | Q4_K_M, Q5_K_M, Q8_0 |
170
+ | qwen2.5-7b | Qwen 2.5 7B | Q4_K_M, Q5_K_M, Q8_0 |
171
+ | gemma-2-9b | Gemma 2 9B | Q4_K_M, Q5_K_M, Q8_0 |
172
+ | mistral-7b | Mistral 7B v0.3 | Q4_K_M, Q5_K_M, Q8_0 |
173
+ | phi-3.5-mini | Phi 3.5 Mini | Q4_K_M, Q5_K_M, Q8_0 |
174
+ | deepseek-coder-6.7b | DeepSeek Coder | Q4_K_M, Q5_K_M, Q8_0 |
175
+
176
+ ## GPU Backends
177
+
178
+ | Backend | Platforms | Description |
179
+ |---------|-----------|-------------|
180
+ | Vulkan | Windows, Linux | Cross-platform GPU API, works on AMD, Intel, NVIDIA |
181
+ | CUDA | Windows, Linux | NVIDIA GPUs |
182
+ | ROCm | Linux | AMD GPUs |
183
+ | Metal | macOS | Apple Silicon |
184
+ | CPU | All | Fallback, no GPU required |
185
+
186
+ ## Function Calling
187
+
188
+ ```python
189
+ from pyllm import Client, LlamaServer
190
+
191
+ tools = [{
192
+     "type": "function",
193
+     "function": {
194
+         "name": "get_weather",
195
+         "description": "Get weather for a location",
196
+         "parameters": {
197
+             "type": "object",
198
+             "properties": {
199
+                 "location": {"type": "string"}
200
+             }
201
+         }
202
+     }
203
+ }]
204
+
205
+ with LlamaServer("model.gguf") as server:
206
+     client = Client(server.base_url)
207
+
208
+     response = client.chat.completions.create(
209
+         model="llama",
210
+         messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
211
+         tools=tools
212
+     )
213
+
214
+     if response.choices[0]["message"].get("tool_calls"):
215
+         print("Model wants to call:", response.choices[0]["message"]["tool_calls"])
216
+ ```
217
+
218
+ ## Requirements
219
+
220
+ - Python 3.8+
221
+ - Vulkan SDK (for Vulkan backend)
222
+ - CUDA Toolkit (for CUDA backend)
223
+ - ROCm (for ROCm backend)
224
+
225
+ ## License
226
+
227
+ MIT License - same as llama.cpp
228
+
229
+ ## Links
230
+
231
+ - [llama.cpp](https://github.com/ggml-org/llama.cpp)
232
+ - [Documentation](https://github.com/ggml-org/llama.cpp/tree/master/tools/server)
233
+ - [Issues](https://github.com/ggml-org/llama.cpp/issues)
@@ -0,0 +1,17 @@
1
+ pyllm/__init__.py,sha256=e7olt5uAPR67AgwNc-8T20Bw5W2ODyCtQCr_HKdcnVA,1789
2
+ pyllm/benchmark.py,sha256=ReB27S3o-olfSwOtvYU2a301c0GEllBqmU87a-IsiXU,13631
3
+ pyllm/binaries.py,sha256=3fCYT382qDHj2-MhY2mZb4iFLR8UfjssCzWuf2oOzqc,11939
4
+ pyllm/cli.py,sha256=BmYxokxqK-6NSicKPyjaJZdk2nLHp_selSCcjC0dumo,25153
5
+ pyllm/client.py,sha256=f_zJH-mGjukd3U76ntSRABgeLdd_dduK43m-jkHWJN8,5185
6
+ pyllm/device.py,sha256=cEsfMm7DYyH5jG1LqAqoVFU4YXpjff8kMwVc3PPhwK8,10698
7
+ pyllm/models.py,sha256=0KJE3Z9wEYBbRgFmbZMMD8N0nLCaTWNFmDvrXmiSnsw,18822
8
+ pyllm/runner.py,sha256=emScwl-zcXtbwj7J0xe6Dyov6QJm_mKrQXZOSY1vlkY,10114
9
+ pyllm/server.py,sha256=D6gl9Z9APCBLataG9tShOgdnjcT5QCopYMULjb2EZEI,8071
10
+ pyllm/bin/darwin/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ pyllm/bin/linux/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
12
+ pyllm/bin/windows/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ moxing-0.1.0.dist-info/METADATA,sha256=LO46cwLrH2hEUKlE8GuWjPWyLnGxfP2ChHMJ3SIPYwA,7230
14
+ moxing-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
15
+ moxing-0.1.0.dist-info/entry_points.txt,sha256=tk3NDRLksFK2lchSqSEgJ6Cby4kBVlNBOjYH5BGW5vg,41
16
+ moxing-0.1.0.dist-info/top_level.txt,sha256=tdBgv9nG64-rOMYgijsbM_MtDiFIest5VKrSV3PZhoI,6
17
+ moxing-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ moxing = pyllm.cli:app
@@ -0,0 +1 @@
1
+ pyllm
pyllm/__init__.py ADDED
@@ -0,0 +1,80 @@
1
+ """
2
+ pyllm - Python wrapper for llama.cpp server
3
+
4
+ Provides an OpenAI API compatible interface for running GGUF models
5
+ with GPU acceleration (Vulkan, CUDA, ROCm, Metal).
6
+
7
+ Features:
8
+ - Auto-detect best GPU device and backend
9
+ - Download models from HuggingFace and ModelScope
10
+ - OpenAI API compatible server
11
+ - Function calling / tool support
12
+ - Multimodal support
13
+
14
+ Quick start:
15
+ from pyllm import quick_run, quick_server
16
+
17
+ # Quick inference
18
+ result = quick_run("llama-3.2-3b", "Write a haiku")
19
+
20
+ # Start server
21
+ with quick_server("llama-3.2-3b") as server:
22
+ # Use OpenAI API at http://localhost:8080/v1
23
+ pass
24
+ """
25
+
26
+ __version__ = "0.1.0"
27
+
28
+ from pyllm.client import Client, ChatCompletion, Message
29
+ from pyllm.server import LlamaServer, ServerConfig, GPUInfo
30
+ from pyllm.device import (
31
+ Device, DeviceConfig, DeviceDetector, BackendType,
32
+ detect_best_backend, get_device_config
33
+ )
34
+ from pyllm.models import (
35
+ ModelDownloader, ModelInfo, ModelRegistry, download_model
36
+ )
37
+ from pyllm.runner import (
38
+ AutoRunner, RunConfig, quick_run, quick_server
39
+ )
40
+ from pyllm.binaries import (
41
+ BinaryManager, get_binary_manager, ensure_binaries, get_server_binary
42
+ )
43
+
44
+ __all__ = [
45
+ # Client
46
+ "Client",
47
+ "ChatCompletion",
48
+ "Message",
49
+
50
+ # Server
51
+ "LlamaServer",
52
+ "ServerConfig",
53
+ "GPUInfo",
54
+
55
+ # Device
56
+ "Device",
57
+ "DeviceConfig",
58
+ "DeviceDetector",
59
+ "BackendType",
60
+ "detect_best_backend",
61
+ "get_device_config",
62
+
63
+ # Models
64
+ "ModelDownloader",
65
+ "ModelInfo",
66
+ "ModelRegistry",
67
+ "download_model",
68
+
69
+ # Runner
70
+ "AutoRunner",
71
+ "RunConfig",
72
+ "quick_run",
73
+ "quick_server",
74
+
75
+ # Binaries
76
+ "BinaryManager",
77
+ "get_binary_manager",
78
+ "ensure_binaries",
79
+ "get_server_binary",
80
+ ]