EvoScientist 0.0.1.dev4__py3-none-any.whl → 0.1.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- EvoScientist/EvoScientist.py +26 -62
- EvoScientist/__init__.py +0 -19
- EvoScientist/backends.py +0 -26
- EvoScientist/cli.py +1111 -498
- EvoScientist/middleware.py +8 -61
- EvoScientist/stream/__init__.py +0 -25
- EvoScientist/stream/utils.py +16 -23
- EvoScientist/tools.py +2 -75
- evoscientist-0.1.0rc1.dist-info/METADATA +199 -0
- evoscientist-0.1.0rc1.dist-info/RECORD +21 -0
- evoscientist-0.1.0rc1.dist-info/entry_points.txt +2 -0
- EvoScientist/config.py +0 -274
- EvoScientist/llm/__init__.py +0 -21
- EvoScientist/llm/models.py +0 -99
- EvoScientist/memory.py +0 -715
- EvoScientist/onboard.py +0 -725
- EvoScientist/paths.py +0 -44
- EvoScientist/skills/accelerate/SKILL.md +0 -332
- EvoScientist/skills/accelerate/references/custom-plugins.md +0 -453
- EvoScientist/skills/accelerate/references/megatron-integration.md +0 -489
- EvoScientist/skills/accelerate/references/performance.md +0 -525
- EvoScientist/skills/bitsandbytes/SKILL.md +0 -411
- EvoScientist/skills/bitsandbytes/references/memory-optimization.md +0 -521
- EvoScientist/skills/bitsandbytes/references/qlora-training.md +0 -521
- EvoScientist/skills/bitsandbytes/references/quantization-formats.md +0 -447
- EvoScientist/skills/find-skills/SKILL.md +0 -133
- EvoScientist/skills/find-skills/scripts/install_skill.py +0 -211
- EvoScientist/skills/flash-attention/SKILL.md +0 -367
- EvoScientist/skills/flash-attention/references/benchmarks.md +0 -215
- EvoScientist/skills/flash-attention/references/transformers-integration.md +0 -293
- EvoScientist/skills/llama-cpp/SKILL.md +0 -258
- EvoScientist/skills/llama-cpp/references/optimization.md +0 -89
- EvoScientist/skills/llama-cpp/references/quantization.md +0 -213
- EvoScientist/skills/llama-cpp/references/server.md +0 -125
- EvoScientist/skills/lm-evaluation-harness/SKILL.md +0 -490
- EvoScientist/skills/lm-evaluation-harness/references/api-evaluation.md +0 -490
- EvoScientist/skills/lm-evaluation-harness/references/benchmark-guide.md +0 -488
- EvoScientist/skills/lm-evaluation-harness/references/custom-tasks.md +0 -602
- EvoScientist/skills/lm-evaluation-harness/references/distributed-eval.md +0 -519
- EvoScientist/skills/ml-paper-writing/SKILL.md +0 -937
- EvoScientist/skills/ml-paper-writing/references/checklists.md +0 -361
- EvoScientist/skills/ml-paper-writing/references/citation-workflow.md +0 -562
- EvoScientist/skills/ml-paper-writing/references/reviewer-guidelines.md +0 -367
- EvoScientist/skills/ml-paper-writing/references/sources.md +0 -159
- EvoScientist/skills/ml-paper-writing/references/writing-guide.md +0 -476
- EvoScientist/skills/ml-paper-writing/templates/README.md +0 -251
- EvoScientist/skills/ml-paper-writing/templates/aaai2026/README.md +0 -534
- EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-supp.tex +0 -144
- EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026-unified-template.tex +0 -952
- EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bib +0 -111
- EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.bst +0 -1493
- EvoScientist/skills/ml-paper-writing/templates/aaai2026/aaai2026.sty +0 -315
- EvoScientist/skills/ml-paper-writing/templates/acl/README.md +0 -50
- EvoScientist/skills/ml-paper-writing/templates/acl/acl.sty +0 -312
- EvoScientist/skills/ml-paper-writing/templates/acl/acl_latex.tex +0 -377
- EvoScientist/skills/ml-paper-writing/templates/acl/acl_lualatex.tex +0 -101
- EvoScientist/skills/ml-paper-writing/templates/acl/acl_natbib.bst +0 -1940
- EvoScientist/skills/ml-paper-writing/templates/acl/anthology.bib.txt +0 -26
- EvoScientist/skills/ml-paper-writing/templates/acl/custom.bib +0 -70
- EvoScientist/skills/ml-paper-writing/templates/acl/formatting.md +0 -326
- EvoScientist/skills/ml-paper-writing/templates/colm2025/README.md +0 -3
- EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bib +0 -11
- EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.bst +0 -1440
- EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.pdf +0 -0
- EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.sty +0 -218
- EvoScientist/skills/ml-paper-writing/templates/colm2025/colm2025_conference.tex +0 -305
- EvoScientist/skills/ml-paper-writing/templates/colm2025/fancyhdr.sty +0 -485
- EvoScientist/skills/ml-paper-writing/templates/colm2025/math_commands.tex +0 -508
- EvoScientist/skills/ml-paper-writing/templates/colm2025/natbib.sty +0 -1246
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/fancyhdr.sty +0 -485
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bib +0 -24
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.bst +0 -1440
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.pdf +0 -0
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.sty +0 -246
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/iclr2026_conference.tex +0 -414
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/math_commands.tex +0 -508
- EvoScientist/skills/ml-paper-writing/templates/iclr2026/natbib.sty +0 -1246
- EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithm.sty +0 -79
- EvoScientist/skills/ml-paper-writing/templates/icml2026/algorithmic.sty +0 -201
- EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.bib +0 -75
- EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.pdf +0 -0
- EvoScientist/skills/ml-paper-writing/templates/icml2026/example_paper.tex +0 -662
- EvoScientist/skills/ml-paper-writing/templates/icml2026/fancyhdr.sty +0 -864
- EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.bst +0 -1443
- EvoScientist/skills/ml-paper-writing/templates/icml2026/icml2026.sty +0 -767
- EvoScientist/skills/ml-paper-writing/templates/icml2026/icml_numpapers.pdf +0 -0
- EvoScientist/skills/ml-paper-writing/templates/neurips2025/Makefile +0 -36
- EvoScientist/skills/ml-paper-writing/templates/neurips2025/extra_pkgs.tex +0 -53
- EvoScientist/skills/ml-paper-writing/templates/neurips2025/main.tex +0 -38
- EvoScientist/skills/ml-paper-writing/templates/neurips2025/neurips.sty +0 -382
- EvoScientist/skills/peft/SKILL.md +0 -431
- EvoScientist/skills/peft/references/advanced-usage.md +0 -514
- EvoScientist/skills/peft/references/troubleshooting.md +0 -480
- EvoScientist/skills/ray-data/SKILL.md +0 -326
- EvoScientist/skills/ray-data/references/integration.md +0 -82
- EvoScientist/skills/ray-data/references/transformations.md +0 -83
- EvoScientist/skills/skill-creator/LICENSE.txt +0 -202
- EvoScientist/skills/skill-creator/SKILL.md +0 -356
- EvoScientist/skills/skill-creator/references/output-patterns.md +0 -82
- EvoScientist/skills/skill-creator/references/workflows.md +0 -28
- EvoScientist/skills/skill-creator/scripts/init_skill.py +0 -303
- EvoScientist/skills/skill-creator/scripts/package_skill.py +0 -110
- EvoScientist/skills/skill-creator/scripts/quick_validate.py +0 -95
- EvoScientist/skills_manager.py +0 -391
- EvoScientist/stream/display.py +0 -604
- EvoScientist/stream/events.py +0 -415
- EvoScientist/stream/state.py +0 -343
- evoscientist-0.0.1.dev4.dist-info/METADATA +0 -367
- evoscientist-0.0.1.dev4.dist-info/RECORD +0 -117
- evoscientist-0.0.1.dev4.dist-info/entry_points.txt +0 -5
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc1.dist-info}/WHEEL +0 -0
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc1.dist-info}/licenses/LICENSE +0 -0
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc1.dist-info}/top_level.txt +0 -0

EvoScientist/skills/llama-cpp/SKILL.md

@@ -1,258 +0,0 @@

---
name: llama-cpp
description: Runs LLM inference on CPU, Apple Silicon, and consumer GPUs without NVIDIA hardware. Use for edge deployment, M1/M2/M3 Macs, AMD/Intel GPUs, or when CUDA is unavailable. Supports GGUF quantization (1.5-8 bit) for reduced memory and 4-10× speedup vs PyTorch on CPU.
version: 1.0.0
author: Orchestra Research
license: MIT
tags: [Inference Serving, Llama.cpp, CPU Inference, Apple Silicon, Edge Deployment, GGUF, Quantization, Non-NVIDIA, AMD GPUs, Intel GPUs, Embedded]
dependencies: [llama-cpp-python]
---

# llama.cpp

Pure C/C++ LLM inference with minimal dependencies, optimized for CPUs and non-NVIDIA hardware.

## When to use llama.cpp

**Use llama.cpp when:**
- Running on CPU-only machines
- Deploying on Apple Silicon (M1/M2/M3/M4)
- Using AMD or Intel GPUs (no CUDA)
- Edge deployment (Raspberry Pi, embedded systems)
- Need simple deployment without Docker/Python

**Use TensorRT-LLM instead when:**
- Have NVIDIA GPUs (A100/H100)
- Need maximum throughput (100K+ tok/s)
- Running in datacenter with CUDA

**Use vLLM instead when:**
- Have NVIDIA GPUs
- Need Python-first API
- Want PagedAttention

## Quick start

### Installation

```bash
# macOS/Linux
brew install llama.cpp

# Or build from source
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make

# With Metal (Apple Silicon)
make LLAMA_METAL=1

# With CUDA (NVIDIA)
make LLAMA_CUDA=1

# With ROCm (AMD)
make LLAMA_HIP=1
```

### Download model

```bash
# Download from HuggingFace (GGUF format)
huggingface-cli download \
  TheBloke/Llama-2-7B-Chat-GGUF \
  llama-2-7b-chat.Q4_K_M.gguf \
  --local-dir models/

# Or convert from HuggingFace
python convert_hf_to_gguf.py models/llama-2-7b-chat/
```

### Run inference

```bash
# Simple chat
./llama-cli \
  -m models/llama-2-7b-chat.Q4_K_M.gguf \
  -p "Explain quantum computing" \
  -n 256  # Max tokens

# Interactive chat
./llama-cli \
  -m models/llama-2-7b-chat.Q4_K_M.gguf \
  --interactive
```
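
The skill's runtime dependency is `llama-cpp-python`, so the same GGUF file can also be driven from Python. A minimal sketch, assuming `pip install llama-cpp-python`; the model path and `n_gpu_layers` value are placeholders for your setup:

```python
# Minimal llama-cpp-python sketch; paths and layer count are placeholders.
from llama_cpp import Llama

llm = Llama(
    model_path="models/llama-2-7b-chat.Q4_K_M.gguf",  # GGUF file downloaded above
    n_ctx=4096,        # context window
    n_gpu_layers=32,   # 0 = CPU only; raise to offload layers to Metal/CUDA/ROCm
)

out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Explain quantum computing"}],
    max_tokens=256,
    temperature=0.7,
)
print(out["choices"][0]["message"]["content"])
```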

### Server mode

```bash
# Start OpenAI-compatible server
./llama-server \
  -m models/llama-2-7b-chat.Q4_K_M.gguf \
  --host 0.0.0.0 \
  --port 8080 \
  -ngl 32  # Offload 32 layers to GPU

# Client request
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama-2-7b-chat",
    "messages": [{"role": "user", "content": "Hello!"}],
    "temperature": 0.7,
    "max_tokens": 100
  }'
```
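
The server speaks the OpenAI chat-completions format, so any HTTP client works. A sketch of the same request from Python with `requests`, using the host, port, and model name configured above:

```python
# Sketch of a client for the llama-server instance started above.
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "llama-2-7b-chat",
        "messages": [{"role": "user", "content": "Hello!"}],
        "temperature": 0.7,
        "max_tokens": 100,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```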

## Quantization formats

### GGUF format overview

| Format | Bits | Size (7B) | Speed | Quality | Use Case |
|--------|------|-----------|-------|---------|----------|
| **Q4_K_M** | 4.5 | 4.1 GB | Fast | Good | **Recommended default** |
| Q4_K_S | 4.3 | 3.9 GB | Faster | Lower | Speed critical |
| Q5_K_M | 5.5 | 4.8 GB | Medium | Better | Quality critical |
| Q6_K | 6.5 | 5.5 GB | Slower | Best | Maximum quality |
| Q8_0 | 8.0 | 7.0 GB | Slow | Excellent | Minimal degradation |
| Q2_K | 2.5 | 2.7 GB | Fastest | Poor | Testing only |

### Choosing quantization

```bash
# General use (balanced)
Q4_K_M  # 4-bit, medium quality

# Maximum speed (more degradation)
Q2_K or Q3_K_M

# Maximum quality (slower)
Q6_K or Q8_0

# Very large models (70B, 405B)
Q3_K_M or Q4_K_S  # Lower bits to fit in memory
```
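
The size column above follows a simple rule of thumb: file size ≈ parameters × bits per weight / 8. A rough sketch of that arithmetic (an approximation only; real GGUF files run somewhat larger because some tensors are kept at higher precision):

```python
# Back-of-the-envelope GGUF size estimate: params * bits_per_weight / 8.
# Bits-per-weight values mirror the table above; results are approximate.
BITS_PER_WEIGHT = {"Q2_K": 2.5, "Q4_K_S": 4.3, "Q4_K_M": 4.5,
                   "Q5_K_M": 5.5, "Q6_K": 6.5, "Q8_0": 8.0}

def estimate_gb(params_billion: float, fmt: str) -> float:
    """Approximate GGUF file size in GB for a given quantization format."""
    return params_billion * 1e9 * BITS_PER_WEIGHT[fmt] / 8 / 1e9

for fmt in ("Q4_K_M", "Q6_K", "Q8_0"):
    print(f"7B {fmt}: ~{estimate_gb(7, fmt):.1f} GB on disk (RAM use is higher)")
```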

## Hardware acceleration

### Apple Silicon (Metal)

```bash
# Build with Metal
make LLAMA_METAL=1

# Run with GPU acceleration (automatic)
./llama-cli -m model.gguf -ngl 999  # Offload all layers

# Performance: M3 Max 40-60 tokens/sec (Llama 2-7B Q4_K_M)
```

### NVIDIA GPUs (CUDA)

```bash
# Build with CUDA
make LLAMA_CUDA=1

# Offload layers to GPU
./llama-cli -m model.gguf -ngl 35  # Offload 35/40 layers

# Hybrid CPU+GPU for large models
./llama-cli -m llama-70b.Q4_K_M.gguf -ngl 20  # GPU: 20 layers, CPU: rest
```

### AMD GPUs (ROCm)

```bash
# Build with ROCm
make LLAMA_HIP=1

# Run with AMD GPU
./llama-cli -m model.gguf -ngl 999
```

## Common patterns

### Batch processing

```bash
# Process multiple prompts from file
cat prompts.txt | ./llama-cli \
  -m model.gguf \
  --batch-size 512 \
  -n 100
```

### Constrained generation

```bash
# JSON output with grammar
./llama-cli \
  -m model.gguf \
  -p "Generate a person: " \
  --grammar-file grammars/json.gbnf

# Outputs valid JSON only
```

### Context size

```bash
# Increase context (default 512)
./llama-cli \
  -m model.gguf \
  -c 4096  # 4K context window

# Very long context (if model supports)
./llama-cli -m model.gguf -c 32768  # 32K context
```

## Performance benchmarks

### CPU performance (Llama 2-7B Q4_K_M)

| CPU | Threads | Speed | Cost |
|-----|---------|-------|------|
| Apple M3 Max | 16 | 50 tok/s | $0 (local) |
| AMD Ryzen 9 7950X | 32 | 35 tok/s | $0.50/hour |
| Intel i9-13900K | 32 | 30 tok/s | $0.40/hour |
| AWS c7i.16xlarge | 64 | 40 tok/s | $2.88/hour |

### GPU acceleration (Llama 2-7B Q4_K_M)

| GPU | Speed | vs CPU | Cost |
|-----|-------|--------|------|
| NVIDIA RTX 4090 | 120 tok/s | 3-4× | $0 (local) |
| NVIDIA A10 | 80 tok/s | 2-3× | $1.00/hour |
| AMD MI250 | 70 tok/s | 2× | $2.00/hour |
| Apple M3 Max (Metal) | 50 tok/s | ~Same | $0 (local) |

## Supported models

**LLaMA family**:
- Llama 2 (7B, 13B, 70B)
- Llama 3 / 3.1 (8B, 70B, 405B)
- Code Llama

**Mistral family**:
- Mistral 7B
- Mixtral 8x7B, 8x22B

**Other**:
- Falcon, BLOOM, GPT-J
- Phi-3, Gemma, Qwen
- LLaVA (vision), Whisper (audio)

**Find models**: https://huggingface.co/models?library=gguf

## References

- **[Quantization Guide](references/quantization.md)** - GGUF formats, conversion, quality comparison
- **[Server Deployment](references/server.md)** - API endpoints, Docker, monitoring
- **[Optimization](references/optimization.md)** - Performance tuning, hybrid CPU+GPU

## Resources

- **GitHub**: https://github.com/ggerganov/llama.cpp
- **Models**: https://huggingface.co/models?library=gguf
- **Discord**: https://discord.gg/llama-cpp

EvoScientist/skills/llama-cpp/references/optimization.md

@@ -1,89 +0,0 @@

# Performance Optimization Guide

Maximize llama.cpp inference speed and efficiency.

## CPU Optimization

### Thread tuning
```bash
# Set threads (default: physical cores)
./llama-cli -m model.gguf -t 8

# For AMD Ryzen 9 7950X (16 cores, 32 threads)
-t 16  # Best: physical cores

# Avoid hyperthreading (slower for matrix ops)
```
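
The rule above can be automated. A small sketch, assuming `psutil` is installed, that picks the physical-core count and launches `llama-cli` with it (binary, model path, and prompt are placeholders):

```python
# Sketch: use physical cores for -t (hyperthreads rarely help matrix-heavy inference).
import subprocess
import psutil

physical_cores = psutil.cpu_count(logical=False) or 4  # fall back if undetectable
subprocess.run(
    ["./llama-cli", "-m", "model.gguf", "-t", str(physical_cores),
     "-p", "Hello", "-n", "64"],
    check=True,
)
```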

### BLAS acceleration
```bash
# OpenBLAS (faster matrix ops)
make LLAMA_OPENBLAS=1

# BLAS gives 2-3× speedup
```

## GPU Offloading

### Layer offloading
```bash
# Offload 35 layers to GPU (hybrid mode)
./llama-cli -m model.gguf -ngl 35

# Offload all layers
./llama-cli -m model.gguf -ngl 999

# Find optimal value:
# Start with -ngl 999
# If OOM, reduce by 5 until it fits
```
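
The back-off search in those comments can be scripted. A rough sketch that treats any non-zero exit code as "did not fit" (a simplification; robust OOM detection may need to inspect stderr, and the paths and test prompt are placeholders):

```python
# Sketch of the "-ngl 999, then back off by 5" search described above.
import subprocess

def fits(ngl: int) -> bool:
    """Run a 1-token generation and report whether it completed."""
    proc = subprocess.run(
        ["./llama-cli", "-m", "model.gguf", "-ngl", str(ngl), "-p", "hi", "-n", "1"],
        capture_output=True,
    )
    return proc.returncode == 0

ngl = 999
while ngl > 0 and not fits(ngl):
    ngl -= 5  # back off until the model fits in VRAM
print(f"largest working -ngl: {max(ngl, 0)}")
```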

### Memory usage
```bash
# Check VRAM usage
nvidia-smi dmon

# Reduce context if needed
./llama-cli -m model.gguf -c 2048  # 2K context instead of 4K
```

## Batch Processing

```bash
# Increase batch size for throughput
./llama-cli -m model.gguf -b 512  # Default: 512

# Physical batch (GPU)
--ubatch-size 128  # Process 128 tokens at once
```

## Context Management

```bash
# Default context (512 tokens)
-c 512

# Longer context (slower, more memory)
-c 4096

# Very long context (if model supports)
-c 32768
```

## Benchmarks

### CPU Performance (Llama 2-7B Q4_K_M)

| Setup | Speed | Notes |
|-------|-------|-------|
| Apple M3 Max | 50 tok/s | Metal acceleration |
| AMD 7950X (16c) | 35 tok/s | OpenBLAS |
| Intel i9-13900K | 30 tok/s | AVX2 |

### GPU Offloading (RTX 4090)

| Layers GPU | Speed | VRAM |
|------------|-------|------|
| 0 (CPU only) | 30 tok/s | 0 GB |
| 20 (hybrid) | 80 tok/s | 8 GB |
| 35 (all) | 120 tok/s | 12 GB |

EvoScientist/skills/llama-cpp/references/quantization.md

@@ -1,213 +0,0 @@

# GGUF Quantization Guide

Complete guide to GGUF quantization formats and model conversion.

## Quantization Overview

**GGUF** (GPT-Generated Unified Format) - Standard format for llama.cpp models.

### Format Comparison

| Format | Perplexity | Size (7B) | Tokens/sec | Notes |
|--------|------------|-----------|------------|-------|
| FP16 | 5.9565 (baseline) | 13.0 GB | 15 tok/s | Original quality |
| Q8_0 | 5.9584 (+0.03%) | 7.0 GB | 25 tok/s | Nearly lossless |
| **Q6_K** | 5.9642 (+0.13%) | 5.5 GB | 30 tok/s | Best quality/size |
| **Q5_K_M** | 5.9796 (+0.39%) | 4.8 GB | 35 tok/s | Balanced |
| **Q4_K_M** | 6.0565 (+1.68%) | 4.1 GB | 40 tok/s | **Recommended** |
| Q4_K_S | 6.1125 (+2.62%) | 3.9 GB | 42 tok/s | Faster, lower quality |
| Q3_K_M | 6.3184 (+6.07%) | 3.3 GB | 45 tok/s | Small models only |
| Q2_K | 6.8673 (+15.3%) | 2.7 GB | 50 tok/s | Not recommended |

**Recommendation**: Use **Q4_K_M** for best balance of quality and speed.

## Converting Models

### HuggingFace to GGUF

```bash
# 1. Download HuggingFace model
huggingface-cli download meta-llama/Llama-2-7b-chat-hf \
  --local-dir models/llama-2-7b-chat/

# 2. Convert to FP16 GGUF
python convert_hf_to_gguf.py \
  models/llama-2-7b-chat/ \
  --outtype f16 \
  --outfile models/llama-2-7b-chat-f16.gguf

# 3. Quantize to Q4_K_M
./llama-quantize \
  models/llama-2-7b-chat-f16.gguf \
  models/llama-2-7b-chat-Q4_K_M.gguf \
  Q4_K_M
```

### Batch quantization

```bash
# Quantize to multiple formats
for quant in Q4_K_M Q5_K_M Q6_K Q8_0; do
  ./llama-quantize \
    model-f16.gguf \
    model-${quant}.gguf \
    $quant
done
```

## K-Quantization Methods

**K-quants** use mixed precision for better quality:
- Attention weights: Higher precision
- Feed-forward weights: Lower precision

**Variants**:
- `_S` (Small): Faster, lower quality
- `_M` (Medium): Balanced (recommended)
- `_L` (Large): Better quality, larger size

**Example**: `Q4_K_M`
- `Q4`: 4-bit quantization
- `K`: Mixed precision method
- `M`: Medium quality

## Quality Testing

```bash
# Calculate perplexity (quality metric)
./llama-perplexity \
  -m model.gguf \
  -f wikitext-2-raw/wiki.test.raw \
  -c 512

# Lower perplexity = better quality
# Baseline (FP16): ~5.96
# Q4_K_M: ~6.06 (+1.7%)
# Q2_K: ~6.87 (+15.3% - too much degradation)
```
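
The percentages in the comparison table are simply relative increases over the FP16 baseline. The arithmetic, with numbers copied from that table:

```python
# Relative perplexity increase vs. the FP16 baseline (numbers from the table above).
baseline = 5.9565  # FP16 perplexity
measured = {"Q8_0": 5.9584, "Q6_K": 5.9642, "Q4_K_M": 6.0565, "Q2_K": 6.8673}

for fmt, ppl in measured.items():
    delta_pct = (ppl - baseline) / baseline * 100
    print(f"{fmt}: {ppl:.4f} (+{delta_pct:.2f}%)")
```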

## Use Case Guide

### General purpose (chatbots, assistants)
```
Q4_K_M - Best balance
Q5_K_M - If you have extra RAM
```

### Code generation
```
Q5_K_M or Q6_K - Higher precision helps with code
```

### Creative writing
```
Q4_K_M - Sufficient quality
Q3_K_M - Acceptable for draft generation
```

### Technical/medical
```
Q6_K or Q8_0 - Maximum accuracy
```

### Edge devices (Raspberry Pi)
```
Q2_K or Q3_K_S - Fit in limited RAM
```

## Model Size Scaling

### 7B parameter models

| Format | Size | RAM needed |
|--------|------|------------|
| Q2_K | 2.7 GB | 5 GB |
| Q3_K_M | 3.3 GB | 6 GB |
| Q4_K_M | 4.1 GB | 7 GB |
| Q5_K_M | 4.8 GB | 8 GB |
| Q6_K | 5.5 GB | 9 GB |
| Q8_0 | 7.0 GB | 11 GB |

### 13B parameter models

| Format | Size | RAM needed |
|--------|------|------------|
| Q2_K | 5.1 GB | 8 GB |
| Q3_K_M | 6.2 GB | 10 GB |
| Q4_K_M | 7.9 GB | 12 GB |
| Q5_K_M | 9.2 GB | 14 GB |
| Q6_K | 10.7 GB | 16 GB |

### 70B parameter models

| Format | Size | RAM needed |
|--------|------|------------|
| Q2_K | 26 GB | 32 GB |
| Q3_K_M | 32 GB | 40 GB |
| Q4_K_M | 41 GB | 48 GB |
| Q4_K_S | 39 GB | 46 GB |
| Q5_K_M | 48 GB | 56 GB |

**Recommendation for 70B**: Use Q3_K_M or Q4_K_S to fit in consumer hardware.

## Finding Pre-Quantized Models

**TheBloke** on HuggingFace:
- https://huggingface.co/TheBloke
- Most models available in all GGUF formats
- No conversion needed

**Example**:
```bash
# Download pre-quantized Llama 2-7B
huggingface-cli download \
  TheBloke/Llama-2-7B-Chat-GGUF \
  llama-2-7b-chat.Q4_K_M.gguf \
  --local-dir models/
```

## Importance Matrices (imatrix)

**What**: Calibration data to improve quantization quality.

**Benefits**:
- 10-20% perplexity improvement with Q4
- Essential for Q3 and below

**Usage**:
```bash
# 1. Generate importance matrix
./llama-imatrix \
  -m model-f16.gguf \
  -f calibration-data.txt \
  -o model.imatrix

# 2. Quantize with imatrix
./llama-quantize \
  --imatrix model.imatrix \
  model-f16.gguf \
  model-Q4_K_M.gguf \
  Q4_K_M
```

**Calibration data**:
- Use domain-specific text (e.g., code for code models)
- ~100MB of representative text
- Higher quality data = better quantization

## Troubleshooting

**Model outputs gibberish**:
- Quantization too aggressive (Q2_K)
- Try Q4_K_M or Q5_K_M
- Verify model converted correctly

**Out of memory**:
- Use lower quantization (Q4_K_S instead of Q5_K_M)
- Offload fewer layers to GPU (`-ngl`)
- Use smaller context (`-c 2048`)

**Slow inference**:
- Higher-bit formats use more compute
- Q8_0 is much slower than Q4_K_M
- Consider the speed vs. quality trade-off

EvoScientist/skills/llama-cpp/references/server.md

@@ -1,125 +0,0 @@

# Server Deployment Guide

Production deployment of llama.cpp server with OpenAI-compatible API.

## Server Modes

### llama-server

```bash
# Basic server
./llama-server \
  -m models/llama-2-7b-chat.Q4_K_M.gguf \
  --host 0.0.0.0 \
  --port 8080 \
  -c 4096  # Context size

# With GPU acceleration
./llama-server \
  -m models/llama-2-70b.Q4_K_M.gguf \
  -ngl 40  # Offload 40 layers to GPU
```

## OpenAI-Compatible API

### Chat completions
```bash
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama-2",
    "messages": [
      {"role": "system", "content": "You are helpful"},
      {"role": "user", "content": "Hello"}
    ],
    "temperature": 0.7,
    "max_tokens": 100
  }'
```

### Streaming
```bash
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "llama-2",
    "messages": [{"role": "user", "content": "Count to 10"}],
    "stream": true
  }'
```
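
Because the endpoints mirror the OpenAI schema, the official `openai` Python SDK can be pointed at llama-server as well. A streaming sketch, assuming `pip install openai` and a server started without API-key protection (the key below is a dummy value):

```python
# Streaming sketch using the openai SDK against the local llama-server.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

stream = client.chat.completions.create(
    model="llama-2",
    messages=[{"role": "user", "content": "Count to 10"}],
    stream=True,
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()
```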

## Docker Deployment

**Dockerfile**:
```dockerfile
FROM ubuntu:22.04
RUN apt-get update && apt-get install -y git build-essential
RUN git clone https://github.com/ggerganov/llama.cpp
WORKDIR /llama.cpp
# NOTE: LLAMA_CUDA=1 needs the CUDA toolkit in the image (e.g. an nvidia/cuda
# base); on plain ubuntu:22.04 build with `make` for a CPU-only server instead.
RUN make LLAMA_CUDA=1
COPY models/ /models/
EXPOSE 8080
CMD ["./llama-server", "-m", "/models/model.gguf", "--host", "0.0.0.0", "--port", "8080"]
```

**Run**:
```bash
docker run --gpus all -p 8080:8080 llama-cpp:latest
```

## Monitoring

```bash
# Server metrics endpoint
curl http://localhost:8080/metrics

# Health check
curl http://localhost:8080/health
```
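
For orchestration or load balancers, the `/health` endpoint above can back a simple readiness probe. An illustrative sketch (URL and timeout are placeholders):

```python
# Illustrative readiness probe for the /health endpoint shown above.
import time
import requests

def wait_until_healthy(url: str = "http://localhost:8080/health", timeout: float = 60.0) -> bool:
    """Poll the health endpoint until it answers 200 or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=2).status_code == 200:
                return True
        except requests.RequestException:
            pass
        time.sleep(1)
    return False

print("server ready" if wait_until_healthy() else "server not responding")
```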

**Metrics**:
- requests_total
- tokens_generated
- prompt_tokens
- completion_tokens
- kv_cache_tokens

## Load Balancing

**NGINX**:
```nginx
upstream llama_cpp {
    server llama1:8080;
    server llama2:8080;
}

server {
    location / {
        proxy_pass http://llama_cpp;
        proxy_read_timeout 300s;
    }
}
```

## Performance Tuning

**Parallel requests**:
```bash
./llama-server \
  -m model.gguf \
  -np 4  # 4 parallel slots
```

**Continuous batching**:
```bash
./llama-server \
  -m model.gguf \
  --cont-batching  # Enable continuous batching
```

**Context caching**:
```bash
./llama-server \
  -m model.gguf \
  --cache-prompt  # Cache processed prompts
```