moxing 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ include README.md
2
+ include pyproject.toml
3
+ include LICENSE
4
+ recursive-include pyllm *.py
5
+ recursive-exclude pyllm/bin *
6
+ include pyllm/bin/windows/.gitkeep
7
+ include pyllm/bin/linux/.gitkeep
8
+ include pyllm/bin/darwin/.gitkeep
moxing-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,233 @@
1
+ Metadata-Version: 2.4
2
+ Name: moxing
3
+ Version: 0.1.0
4
+ Summary: Python wrapper for llama.cpp - OpenAI API compatible LLM backend with auto GPU detection
5
+ Author: llama.cpp community
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ggml-org/llama.cpp
8
+ Project-URL: Documentation, https://github.com/ggml-org/llama.cpp/tree/master/tools/server
9
+ Project-URL: Repository, https://github.com/ggml-org/llama.cpp
10
+ Project-URL: Issues, https://github.com/ggml-org/llama.cpp/issues
11
+ Keywords: llama,llama.cpp,gguf,openai,api,gpu,vulkan,cuda,ai,llm
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Requires-Python: >=3.8
26
+ Description-Content-Type: text/markdown
27
+ Requires-Dist: httpx>=0.24.0
28
+ Requires-Dist: pydantic>=2.0.0
29
+ Requires-Dist: rich>=13.0.0
30
+ Requires-Dist: typer>=0.9.0
31
+ Requires-Dist: psutil>=5.9.0
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
34
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
35
+ Requires-Dist: build>=1.0.0; extra == "dev"
36
+ Requires-Dist: twine>=4.0.0; extra == "dev"
37
+ Provides-Extra: openai
38
+ Requires-Dist: openai>=1.0.0; extra == "openai"
39
+ Provides-Extra: hf
40
+ Requires-Dist: huggingface_hub>=0.20.0; extra == "hf"
41
+ Provides-Extra: modelscope
42
+ Requires-Dist: modelscope>=1.10.0; extra == "modelscope"
43
+ Provides-Extra: all
44
+ Requires-Dist: openai>=1.0.0; extra == "all"
45
+ Requires-Dist: huggingface_hub>=0.20.0; extra == "all"
46
+ Requires-Dist: modelscope>=1.10.0; extra == "all"
47
+
48
+ # moxing (模型)
49
+
50
+ Python wrapper for llama.cpp - OpenAI API compatible LLM backend with automatic GPU detection and model downloading.
51
+
52
+ **moxing** (模型) means "model" in Chinese. A simple, unified interface for running LLMs locally.
53
+
54
+ ## Features
55
+
56
+ - **Auto GPU Detection**: Automatically detects and configures the best GPU backend (Vulkan, CUDA, ROCm, Metal)
57
+ - **Model Downloading**: Download GGUF models from HuggingFace and ModelScope
58
+ - **OpenAI API Compatible**: Drop-in replacement for OpenAI API
59
+ - **Function Calling**: Support for tools and function calling
60
+ - **Pre-built Binaries**: Automatically downloads pre-built llama.cpp binaries
61
+ - **Benchmark**: Measure tokens/second performance like ollama
62
+
63
+ ## Installation
64
+
65
+ ```bash
66
+ pip install moxing
67
+ ```
68
+
69
+ ## Quick Start
70
+
71
+ ### Command Line
72
+
73
+ ```bash
74
+ # List available GPUs
75
+ moxing devices
76
+
77
+ # Run inference with a model
78
+ moxing run ./model.gguf -p "Hello, world!"
79
+
80
+ # Quick speed test
81
+ moxing speed ./model.gguf
82
+
83
+ # Benchmark performance
84
+ moxing bench ./model.gguf
85
+
86
+ # Download a model
87
+ moxing download llama-3.2-3b -q Q4_K_M
88
+
89
+ # Start an OpenAI-compatible server
90
+ moxing serve llama-3.2-3b -p 8080
91
+ ```
92
+
93
+ ### Python API
94
+
95
+ ```python
96
+ from pyllm import quick_run, quick_server, Client
97
+
98
+ # Quick inference
99
+ result = quick_run("llama-3.2-3b", "Write a haiku about coding")
100
+ print(result)
101
+
102
+ # Start server with auto-configuration
103
+ with quick_server("llama-3.2-3b") as server:
104
+ client = Client(server.base_url)
105
+
106
+ response = client.chat.completions.create(
107
+ model="llama",
108
+ messages=[{"role": "user", "content": "Hello!"}]
109
+ )
110
+ print(response.choices[0]["message"]["content"])
111
+ ```
112
+
113
+ ### Auto GPU Detection
114
+
115
+ ```python
116
+ from pyllm import DeviceDetector, AutoRunner
117
+
118
+ # Detect available GPUs
119
+ detector = DeviceDetector()
120
+ devices = detector.detect()
121
+ for device in devices:
122
+ print(f"{device.name} ({device.backend.value}, {device.memory_gb:.1f}GB)")
123
+
124
+ # Get optimal configuration
125
+ config = detector.get_best_device(model_size_gb=5.0)
126
+ print(f"Best device: {config.device.name}")
127
+ print(f"Recommended GPU layers: {config.n_gpu_layers}")
128
+ ```
129
+
130
+ ### Model Download
131
+
132
+ ```python
133
+ from pyllm import ModelDownloader
134
+
135
+ downloader = ModelDownloader()
136
+
137
+ # Download from HuggingFace
138
+ path = downloader.download("Qwen/Qwen2.5-7B-Instruct-GGUF", "Q4_K_M.gguf")
139
+
140
+ # Download from ModelScope
141
+ path = downloader.download(
142
+ "LLM-Research/Meta-Llama-3-8B-Instruct-GGUF",
143
+ "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
144
+ source="modelscope"
145
+ )
146
+ ```
147
+
148
+ ## CLI Commands
149
+
150
+ | Command | Description |
151
+ |---------|-------------|
152
+ | `moxing serve` | Start OpenAI-compatible server |
153
+ | `moxing run` | Run inference with a model |
154
+ | `moxing chat` | Interactive chat with a model |
155
+ | `moxing bench` | Benchmark model performance |
156
+ | `moxing speed` | Quick speed test |
157
+ | `moxing info` | Show model info and estimates |
158
+ | `moxing download` | Download a model |
159
+ | `moxing models` | List available models |
160
+ | `moxing devices` | List GPU devices |
161
+ | `moxing config` | Show optimal configuration |
162
+ | `moxing diagnose` | Diagnose system setup |
163
+
164
+ ## Popular Models
165
+
166
+ | Name | Description | Sizes |
167
+ |------|-------------|-------|
168
+ | llama-3.2-3b | Llama 3.2 3B | Q4_K_M, Q5_K_M, Q8_0 |
169
+ | llama-3.1-8b | Llama 3.1 8B | Q4_K_M, Q5_K_M, Q8_0 |
170
+ | qwen2.5-7b | Qwen 2.5 7B | Q4_K_M, Q5_K_M, Q8_0 |
171
+ | gemma-2-9b | Gemma 2 9B | Q4_K_M, Q5_K_M, Q8_0 |
172
+ | mistral-7b | Mistral 7B v0.3 | Q4_K_M, Q5_K_M, Q8_0 |
173
+ | phi-3.5-mini | Phi 3.5 Mini | Q4_K_M, Q5_K_M, Q8_0 |
174
+ | deepseek-coder-6.7b | DeepSeek Coder | Q4_K_M, Q5_K_M, Q8_0 |
175
+
176
+ ## GPU Backends
177
+
178
+ | Backend | Platforms | Description |
179
+ |---------|-----------|-------------|
180
+ | Vulkan | Windows, Linux | Cross-platform GPU API, works on AMD, Intel, NVIDIA |
181
+ | CUDA | Windows, Linux | NVIDIA GPUs |
182
+ | ROCm | Linux | AMD GPUs |
183
+ | Metal | macOS | Apple Silicon |
184
+ | CPU | All | Fallback, no GPU required |
185
+
186
+ ## Function Calling
187
+
188
+ ```python
189
+ from pyllm import Client, LlamaServer
190
+
191
+ tools = [{
192
+ "type": "function",
193
+ "function": {
194
+ "name": "get_weather",
195
+ "description": "Get weather for a location",
196
+ "parameters": {
197
+ "type": "object",
198
+ "properties": {
199
+ "location": {"type": "string"}
200
+ }
201
+ }
202
+ }
203
+ }]
204
+
205
+ with LlamaServer("model.gguf") as server:
206
+ client = Client(server.base_url)
207
+
208
+ response = client.chat.completions.create(
209
+ model="llama",
210
+ messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
211
+ tools=tools
212
+ )
213
+
214
+ if response.choices[0]["message"].get("tool_calls"):
215
+ print("Model wants to call:", response.choices[0]["message"]["tool_calls"])
216
+ ```
217
+
218
+ ## Requirements
219
+
220
+ - Python 3.8+
221
+ - Vulkan SDK (for Vulkan backend)
222
+ - CUDA Toolkit (for CUDA backend)
223
+ - ROCm (for ROCm backend)
224
+
225
+ ## License
226
+
227
+ MIT License - same as llama.cpp
228
+
229
+ ## Links
230
+
231
+ - [llama.cpp](https://github.com/ggml-org/llama.cpp)
232
+ - [Documentation](https://github.com/ggml-org/llama.cpp/tree/master/tools/server)
233
+ - [Issues](https://github.com/ggml-org/llama.cpp/issues)
moxing-0.1.0/README.md ADDED
@@ -0,0 +1,186 @@
1
+ # moxing (模型)
2
+
3
+ Python wrapper for llama.cpp - OpenAI API compatible LLM backend with automatic GPU detection and model downloading.
4
+
5
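+ **moxing** (模型) means "model" in Chinese. It provides a simple, unified interface for running LLMs locally. The distribution is published as `moxing`, the importable Python package is `pyllm`, and the installed command-line tool is `moxing`.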
6
+
7
+ ## Features
8
+
9
+ - **Auto GPU Detection**: Automatically detects and configures the best GPU backend (Vulkan, CUDA, ROCm, Metal)
10
+ - **Model Downloading**: Download GGUF models from HuggingFace and ModelScope
11
+ - **OpenAI API Compatible**: Drop-in replacement for the OpenAI API
12
+ - **Function Calling**: Support for tools and function calling
13
+ - **Pre-built Binaries**: Automatically downloads pre-built llama.cpp binaries
14
+ - **Benchmark**: Measure tokens-per-second performance, similar to Ollama
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ pip install moxing
20
+ ```
21
+
22
+ ## Quick Start
23
+
24
+ ### Command Line
25
+
26
+ ```bash
27
+ # List available GPUs
28
+ moxing devices
29
+
30
+ # Run inference with a model
31
+ moxing run ./model.gguf -p "Hello, world!"
32
+
33
+ # Quick speed test
34
+ moxing speed ./model.gguf
35
+
36
+ # Benchmark performance
37
+ moxing bench ./model.gguf
38
+
39
+ # Download a model
40
+ moxing download llama-3.2-3b -q Q4_K_M
41
+
42
+ # Start an OpenAI-compatible server
43
+ moxing serve llama-3.2-3b -p 8080
44
+ ```
45
+
46
+ ### Python API
47
+
48
+ ```python
49
+ from pyllm import quick_run, quick_server, Client
50
+
51
+ # Quick inference
52
+ result = quick_run("llama-3.2-3b", "Write a haiku about coding")
53
+ print(result)
54
+
55
+ # Start server with auto-configuration
56
+ with quick_server("llama-3.2-3b") as server:
57
+     client = Client(server.base_url)
58
+
59
+     response = client.chat.completions.create(
60
+         model="llama",
61
+         messages=[{"role": "user", "content": "Hello!"}]
62
+     )
63
+     print(response.choices[0]["message"]["content"])
64
+ ```
65
+
66
+ ### Auto GPU Detection
67
+
68
+ ```python
69
+ from pyllm import DeviceDetector, AutoRunner
70
+
71
+ # Detect available GPUs
72
+ detector = DeviceDetector()
73
+ devices = detector.detect()
74
+ for device in devices:
75
+     print(f"{device.name} ({device.backend.value}, {device.memory_gb:.1f}GB)")
76
+
77
+ # Get optimal configuration
78
+ config = detector.get_best_device(model_size_gb=5.0)
79
+ print(f"Best device: {config.device.name}")
80
+ print(f"Recommended GPU layers: {config.n_gpu_layers}")
81
+ ```
82
+
83
+ ### Model Download
84
+
85
+ ```python
86
+ from pyllm import ModelDownloader
87
+
88
+ downloader = ModelDownloader()
89
+
90
+ # Download from HuggingFace
91
+ path = downloader.download("Qwen/Qwen2.5-7B-Instruct-GGUF", "Q4_K_M.gguf")
92
+
93
+ # Download from ModelScope
94
+ path = downloader.download(
95
+ "LLM-Research/Meta-Llama-3-8B-Instruct-GGUF",
96
+ "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
97
+ source="modelscope"
98
+ )
99
+ ```
100
+
101
+ ## CLI Commands
102
+
103
+ | Command | Description |
104
+ |---------|-------------|
105
+ | `moxing serve` | Start OpenAI-compatible server |
106
+ | `moxing run` | Run inference with a model |
107
+ | `moxing chat` | Interactive chat with a model |
108
+ | `moxing bench` | Benchmark model performance |
109
+ | `moxing speed` | Quick speed test |
110
+ | `moxing info` | Show model info and estimates |
111
+ | `moxing download` | Download a model |
112
+ | `moxing models` | List available models |
113
+ | `moxing devices` | List GPU devices |
114
+ | `moxing config` | Show optimal configuration |
115
+ | `moxing diagnose` | Diagnose system setup |
116
+
117
+ ## Popular Models
118
+
119
+ | Name | Description | Sizes |
120
+ |------|-------------|-------|
121
+ | llama-3.2-3b | Llama 3.2 3B | Q4_K_M, Q5_K_M, Q8_0 |
122
+ | llama-3.1-8b | Llama 3.1 8B | Q4_K_M, Q5_K_M, Q8_0 |
123
+ | qwen2.5-7b | Qwen 2.5 7B | Q4_K_M, Q5_K_M, Q8_0 |
124
+ | gemma-2-9b | Gemma 2 9B | Q4_K_M, Q5_K_M, Q8_0 |
125
+ | mistral-7b | Mistral 7B v0.3 | Q4_K_M, Q5_K_M, Q8_0 |
126
+ | phi-3.5-mini | Phi 3.5 Mini | Q4_K_M, Q5_K_M, Q8_0 |
127
+ | deepseek-coder-6.7b | DeepSeek Coder | Q4_K_M, Q5_K_M, Q8_0 |
128
+
129
+ ## GPU Backends
130
+
131
+ | Backend | Platforms | Description |
132
+ |---------|-----------|-------------|
133
+ | Vulkan | Windows, Linux | Cross-platform GPU API, works on AMD, Intel, NVIDIA |
134
+ | CUDA | Windows, Linux | NVIDIA GPUs |
135
+ | ROCm | Linux | AMD GPUs |
136
+ | Metal | macOS | Apple Silicon |
137
+ | CPU | All | Fallback, no GPU required |
138
+
139
+ ## Function Calling
140
+
141
+ ```python
142
+ from pyllm import Client, LlamaServer
143
+
144
+ tools = [{
145
+ "type": "function",
146
+ "function": {
147
+ "name": "get_weather",
148
+ "description": "Get weather for a location",
149
+ "parameters": {
150
+ "type": "object",
151
+ "properties": {
152
+ "location": {"type": "string"}
153
+ }
154
+ }
155
+ }
156
+ }]
157
+
158
+ with LlamaServer("model.gguf") as server:
159
+     client = Client(server.base_url)
160
+
161
+     response = client.chat.completions.create(
162
+         model="llama",
163
+         messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
164
+         tools=tools
165
+     )
166
+
167
+     if response.choices[0]["message"].get("tool_calls"):
168
+         print("Model wants to call:", response.choices[0]["message"]["tool_calls"])
169
+ ```
170
+
171
+ ## Requirements
172
+
173
+ - Python 3.8+
174
+ - Vulkan SDK (for Vulkan backend)
175
+ - CUDA Toolkit (for CUDA backend)
176
+ - ROCm (for ROCm backend)
177
+
178
+ ## License
179
+
180
+ MIT License - same as llama.cpp
181
+
182
+ ## Links
183
+
184
+ - [llama.cpp](https://github.com/ggml-org/llama.cpp)
185
+ - [Documentation](https://github.com/ggml-org/llama.cpp/tree/master/tools/server)
186
+ - [Issues](https://github.com/ggml-org/llama.cpp/issues)
@@ -0,0 +1,233 @@
1
+ Metadata-Version: 2.4
2
+ Name: moxing
3
+ Version: 0.1.0
4
+ Summary: Python wrapper for llama.cpp - OpenAI API compatible LLM backend with auto GPU detection
5
+ Author: llama.cpp community
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ggml-org/llama.cpp
8
+ Project-URL: Documentation, https://github.com/ggml-org/llama.cpp/tree/master/tools/server
9
+ Project-URL: Repository, https://github.com/ggml-org/llama.cpp
10
+ Project-URL: Issues, https://github.com/ggml-org/llama.cpp/issues
11
+ Keywords: llama,llama.cpp,gguf,openai,api,gpu,vulkan,cuda,ai,llm
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.8
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Requires-Python: >=3.8
26
+ Description-Content-Type: text/markdown
27
+ Requires-Dist: httpx>=0.24.0
28
+ Requires-Dist: pydantic>=2.0.0
29
+ Requires-Dist: rich>=13.0.0
30
+ Requires-Dist: typer>=0.9.0
31
+ Requires-Dist: psutil>=5.9.0
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
34
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
35
+ Requires-Dist: build>=1.0.0; extra == "dev"
36
+ Requires-Dist: twine>=4.0.0; extra == "dev"
37
+ Provides-Extra: openai
38
+ Requires-Dist: openai>=1.0.0; extra == "openai"
39
+ Provides-Extra: hf
40
+ Requires-Dist: huggingface_hub>=0.20.0; extra == "hf"
41
+ Provides-Extra: modelscope
42
+ Requires-Dist: modelscope>=1.10.0; extra == "modelscope"
43
+ Provides-Extra: all
44
+ Requires-Dist: openai>=1.0.0; extra == "all"
45
+ Requires-Dist: huggingface_hub>=0.20.0; extra == "all"
46
+ Requires-Dist: modelscope>=1.10.0; extra == "all"
47
+
48
+ # moxing (模型)
49
+
50
+ Python wrapper for llama.cpp - OpenAI API compatible LLM backend with automatic GPU detection and model downloading.
51
+
52
+ **moxing** (模型) means "model" in Chinese. A simple, unified interface for running LLMs locally.
53
+
54
+ ## Features
55
+
56
+ - **Auto GPU Detection**: Automatically detects and configures the best GPU backend (Vulkan, CUDA, ROCm, Metal)
57
+ - **Model Downloading**: Download GGUF models from HuggingFace and ModelScope
58
+ - **OpenAI API Compatible**: Drop-in replacement for OpenAI API
59
+ - **Function Calling**: Support for tools and function calling
60
+ - **Pre-built Binaries**: Automatically downloads pre-built llama.cpp binaries
61
+ - **Benchmark**: Measure tokens/second performance like ollama
62
+
63
+ ## Installation
64
+
65
+ ```bash
66
+ pip install moxing
67
+ ```
68
+
69
+ ## Quick Start
70
+
71
+ ### Command Line
72
+
73
+ ```bash
74
+ # List available GPUs
75
+ moxing devices
76
+
77
+ # Run inference with a model
78
+ moxing run ./model.gguf -p "Hello, world!"
79
+
80
+ # Quick speed test
81
+ moxing speed ./model.gguf
82
+
83
+ # Benchmark performance
84
+ moxing bench ./model.gguf
85
+
86
+ # Download a model
87
+ moxing download llama-3.2-3b -q Q4_K_M
88
+
89
+ # Start an OpenAI-compatible server
90
+ moxing serve llama-3.2-3b -p 8080
91
+ ```
92
+
93
+ ### Python API
94
+
95
+ ```python
96
+ from pyllm import quick_run, quick_server, Client
97
+
98
+ # Quick inference
99
+ result = quick_run("llama-3.2-3b", "Write a haiku about coding")
100
+ print(result)
101
+
102
+ # Start server with auto-configuration
103
+ with quick_server("llama-3.2-3b") as server:
104
+ client = Client(server.base_url)
105
+
106
+ response = client.chat.completions.create(
107
+ model="llama",
108
+ messages=[{"role": "user", "content": "Hello!"}]
109
+ )
110
+ print(response.choices[0]["message"]["content"])
111
+ ```
112
+
113
+ ### Auto GPU Detection
114
+
115
+ ```python
116
+ from pyllm import DeviceDetector, AutoRunner
117
+
118
+ # Detect available GPUs
119
+ detector = DeviceDetector()
120
+ devices = detector.detect()
121
+ for device in devices:
122
+ print(f"{device.name} ({device.backend.value}, {device.memory_gb:.1f}GB)")
123
+
124
+ # Get optimal configuration
125
+ config = detector.get_best_device(model_size_gb=5.0)
126
+ print(f"Best device: {config.device.name}")
127
+ print(f"Recommended GPU layers: {config.n_gpu_layers}")
128
+ ```
129
+
130
+ ### Model Download
131
+
132
+ ```python
133
+ from pyllm import ModelDownloader
134
+
135
+ downloader = ModelDownloader()
136
+
137
+ # Download from HuggingFace
138
+ path = downloader.download("Qwen/Qwen2.5-7B-Instruct-GGUF", "Q4_K_M.gguf")
139
+
140
+ # Download from ModelScope
141
+ path = downloader.download(
142
+ "LLM-Research/Meta-Llama-3-8B-Instruct-GGUF",
143
+ "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf",
144
+ source="modelscope"
145
+ )
146
+ ```
147
+
148
+ ## CLI Commands
149
+
150
+ | Command | Description |
151
+ |---------|-------------|
152
+ | `moxing serve` | Start OpenAI-compatible server |
153
+ | `moxing run` | Run inference with a model |
154
+ | `moxing chat` | Interactive chat with a model |
155
+ | `moxing bench` | Benchmark model performance |
156
+ | `moxing speed` | Quick speed test |
157
+ | `moxing info` | Show model info and estimates |
158
+ | `moxing download` | Download a model |
159
+ | `moxing models` | List available models |
160
+ | `moxing devices` | List GPU devices |
161
+ | `moxing config` | Show optimal configuration |
162
+ | `moxing diagnose` | Diagnose system setup |
163
+
164
+ ## Popular Models
165
+
166
+ | Name | Description | Sizes |
167
+ |------|-------------|-------|
168
+ | llama-3.2-3b | Llama 3.2 3B | Q4_K_M, Q5_K_M, Q8_0 |
169
+ | llama-3.1-8b | Llama 3.1 8B | Q4_K_M, Q5_K_M, Q8_0 |
170
+ | qwen2.5-7b | Qwen 2.5 7B | Q4_K_M, Q5_K_M, Q8_0 |
171
+ | gemma-2-9b | Gemma 2 9B | Q4_K_M, Q5_K_M, Q8_0 |
172
+ | mistral-7b | Mistral 7B v0.3 | Q4_K_M, Q5_K_M, Q8_0 |
173
+ | phi-3.5-mini | Phi 3.5 Mini | Q4_K_M, Q5_K_M, Q8_0 |
174
+ | deepseek-coder-6.7b | DeepSeek Coder | Q4_K_M, Q5_K_M, Q8_0 |
175
+
176
+ ## GPU Backends
177
+
178
+ | Backend | Platforms | Description |
179
+ |---------|-----------|-------------|
180
+ | Vulkan | Windows, Linux | Cross-platform GPU API, works on AMD, Intel, NVIDIA |
181
+ | CUDA | Windows, Linux | NVIDIA GPUs |
182
+ | ROCm | Linux | AMD GPUs |
183
+ | Metal | macOS | Apple Silicon |
184
+ | CPU | All | Fallback, no GPU required |
185
+
186
+ ## Function Calling
187
+
188
+ ```python
189
+ from pyllm import Client, LlamaServer
190
+
191
+ tools = [{
192
+ "type": "function",
193
+ "function": {
194
+ "name": "get_weather",
195
+ "description": "Get weather for a location",
196
+ "parameters": {
197
+ "type": "object",
198
+ "properties": {
199
+ "location": {"type": "string"}
200
+ }
201
+ }
202
+ }
203
+ }]
204
+
205
+ with LlamaServer("model.gguf") as server:
206
+ client = Client(server.base_url)
207
+
208
+ response = client.chat.completions.create(
209
+ model="llama",
210
+ messages=[{"role": "user", "content": "What's the weather in Tokyo?"}],
211
+ tools=tools
212
+ )
213
+
214
+ if response.choices[0]["message"].get("tool_calls"):
215
+ print("Model wants to call:", response.choices[0]["message"]["tool_calls"])
216
+ ```
217
+
218
+ ## Requirements
219
+
220
+ - Python 3.8+
221
+ - Vulkan SDK (for Vulkan backend)
222
+ - CUDA Toolkit (for CUDA backend)
223
+ - ROCm (for ROCm backend)
224
+
225
+ ## License
226
+
227
+ MIT License - same as llama.cpp
228
+
229
+ ## Links
230
+
231
+ - [llama.cpp](https://github.com/ggml-org/llama.cpp)
232
+ - [Documentation](https://github.com/ggml-org/llama.cpp/tree/master/tools/server)
233
+ - [Issues](https://github.com/ggml-org/llama.cpp/issues)
@@ -0,0 +1,21 @@
1
+ MANIFEST.in
2
+ README.md
3
+ pyproject.toml
4
+ moxing.egg-info/PKG-INFO
5
+ moxing.egg-info/SOURCES.txt
6
+ moxing.egg-info/dependency_links.txt
7
+ moxing.egg-info/entry_points.txt
8
+ moxing.egg-info/requires.txt
9
+ moxing.egg-info/top_level.txt
10
+ pyllm/__init__.py
11
+ pyllm/benchmark.py
12
+ pyllm/binaries.py
13
+ pyllm/cli.py
14
+ pyllm/client.py
15
+ pyllm/device.py
16
+ pyllm/models.py
17
+ pyllm/runner.py
18
+ pyllm/server.py
19
+ pyllm/bin/darwin/.gitkeep
20
+ pyllm/bin/linux/.gitkeep
21
+ pyllm/bin/windows/.gitkeep
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ moxing = pyllm.cli:app