kvcache-bench 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kvcache_bench-0.1.0/LICENSE +15 -0
- kvcache_bench-0.1.0/PKG-INFO +139 -0
- kvcache_bench-0.1.0/README.md +115 -0
- kvcache_bench-0.1.0/kvcache_bench/__init__.py +10 -0
- kvcache_bench-0.1.0/kvcache_bench/bench.py +256 -0
- kvcache_bench-0.1.0/kvcache_bench/cli.py +124 -0
- kvcache_bench-0.1.0/kvcache_bench/gpu.py +79 -0
- kvcache_bench-0.1.0/kvcache_bench/ollama.py +133 -0
- kvcache_bench-0.1.0/kvcache_bench.egg-info/PKG-INFO +139 -0
- kvcache_bench-0.1.0/kvcache_bench.egg-info/SOURCES.txt +14 -0
- kvcache_bench-0.1.0/kvcache_bench.egg-info/dependency_links.txt +1 -0
- kvcache_bench-0.1.0/kvcache_bench.egg-info/entry_points.txt +2 -0
- kvcache_bench-0.1.0/kvcache_bench.egg-info/requires.txt +9 -0
- kvcache_bench-0.1.0/kvcache_bench.egg-info/top_level.txt +1 -0
- kvcache_bench-0.1.0/pyproject.toml +33 -0
- kvcache_bench-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
you may not use this file except in compliance with the License.
|
|
7
|
+
You may obtain a copy of the License at
|
|
8
|
+
|
|
9
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
|
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
See the License for the specific language governing permissions and
|
|
15
|
+
limitations under the License.
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kvcache-bench
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Benchmark every KV cache compression method on your GPU. One command, real numbers.
|
|
5
|
+
Author: back2matching
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/back2matching/kvcache-bench
|
|
8
|
+
Keywords: llm,kv-cache,benchmark,vram,gpu,ollama,llama-cpp,quantization
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: requests>=2.28.0
|
|
17
|
+
Requires-Dist: pynvml>=11.5.0
|
|
18
|
+
Provides-Extra: charts
|
|
19
|
+
Requires-Dist: matplotlib; extra == "charts"
|
|
20
|
+
Provides-Extra: hf
|
|
21
|
+
Requires-Dist: torch; extra == "hf"
|
|
22
|
+
Requires-Dist: transformers; extra == "hf"
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# kvcache-bench
|
|
26
|
+
|
|
27
|
+
Benchmark every KV cache compression method on your GPU. One command, real numbers.
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
kvcache-bench --model qwen3.5:9b
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
| KV Type | Context | Prompt | Gen tok/s | Prefill tok/s | VRAM +MB | Quality |
|
|
35
|
+
|---------|---------|-----------|-----------|---------------|----------|---------|
|
|
36
|
+
| f16 | 4096 | short | 80.1 | 712.3 | +142 | PASS |
|
|
37
|
+
| q8_0 | 4096 | short | 79.5 | 723.5 | +71 | PASS |
|
|
38
|
+
| q4_0 | 4096 | short | 78.2 | 698.1 | +36 | PASS |
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Why
|
|
42
|
+
|
|
43
|
+
When you run a local LLM, the KV cache eats your VRAM. Ollama and llama.cpp support different KV cache quantization types (f16, q8_0, q4_0), but nobody tells you what the actual tradeoff is on YOUR hardware.
|
|
44
|
+
|
|
45
|
+
Current state of the world:
|
|
46
|
+
- You Google "ollama kv cache quantization" and find forum posts with conflicting advice
|
|
47
|
+
- You manually test each config, eyeball nvidia-smi, and guess
|
|
48
|
+
- No tool compares them systematically
|
|
49
|
+
|
|
50
|
+
kvcache-bench fixes this. It tests every KV cache type on your GPU and gives you a comparison table with speed, VRAM, and quality.
|
|
51
|
+
|
|
52
|
+
## Install
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install kvcache-bench
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Usage
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
# Auto-detect your first model, test all KV types
|
|
62
|
+
kvcache-bench
|
|
63
|
+
|
|
64
|
+
# Specific model
|
|
65
|
+
kvcache-bench --model qwen3.5:9b
|
|
66
|
+
|
|
67
|
+
# Test at multiple context lengths (where KV savings matter most)
|
|
68
|
+
kvcache-bench --model llama3.1:8b --context 4096,8192,16384
|
|
69
|
+
|
|
70
|
+
# Include tool calling test
|
|
71
|
+
kvcache-bench --model qwen3.5:9b --prompts short,code,reasoning,tool_call
|
|
72
|
+
|
|
73
|
+
# Save results as JSON
|
|
74
|
+
kvcache-bench --model qwen3.5:9b --json results.json
|
|
75
|
+
|
|
76
|
+
# Just show GPU info
|
|
77
|
+
kvcache-bench --gpu
|
|
78
|
+
|
|
79
|
+
# List available models
|
|
80
|
+
kvcache-bench --list-models
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## What It Tests
|
|
84
|
+
|
|
85
|
+
For each KV cache type (f16, q8_0, q4_0), it measures:
|
|
86
|
+
|
|
87
|
+
| Metric | How |
|
|
88
|
+
|--------|-----|
|
|
89
|
+
| **Generation speed** | Tokens per second during generation |
|
|
90
|
+
| **Prefill speed** | Tokens per second processing the prompt |
|
|
91
|
+
| **VRAM delta** | Extra VRAM used beyond model weights (measured via nvidia-smi) |
|
|
92
|
+
| **Quality** | Auto-checked against expected answers (Paris, code structure, reasoning) |
|
|
93
|
+
|
|
94
|
+
## How It Works
|
|
95
|
+
|
|
96
|
+
1. Detects your GPU and Ollama installation
|
|
97
|
+
2. For each KV cache type: restarts Ollama with `OLLAMA_KV_CACHE_TYPE=<type>`, warms up the model, runs benchmark prompts
|
|
98
|
+
3. Measures VRAM before and during inference via nvidia-smi
|
|
99
|
+
4. Extracts timing from Ollama's API response (prompt_eval_duration, eval_duration)
|
|
100
|
+
5. Checks response quality with simple auto-graders
|
|
101
|
+
6. Produces a markdown table (and optional JSON)
|
|
102
|
+
|
|
103
|
+
## What the Research Says
|
|
104
|
+
|
|
105
|
+
Based on llama.cpp community benchmarks and our testing:
|
|
106
|
+
|
|
107
|
+
| KV Type | VRAM Savings | Perplexity Impact | Best For |
|
|
108
|
+
|---------|-------------|-------------------|----------|
|
|
109
|
+
| f16 | Baseline | None | When you have VRAM to spare |
|
|
110
|
+
| q8_0 | 2x | +0.004 (negligible) | **Default recommendation.** Free VRAM, zero quality cost. |
|
|
111
|
+
| q4_0 | 4x | +0.2 (noticeable) | When you need max context length or are VRAM-constrained |
|
|
112
|
+
|
|
113
|
+
The sweet spot for most users: **q8_0**. Halves your KV cache VRAM with essentially zero quality loss.
|
|
114
|
+
|
|
115
|
+
## Requirements
|
|
116
|
+
|
|
117
|
+
- Python 3.10+
|
|
118
|
+
- NVIDIA GPU with nvidia-smi
|
|
119
|
+
- Ollama installed and running
|
|
120
|
+
|
|
121
|
+
## Roadmap
|
|
122
|
+
|
|
123
|
+
- [ ] Mixed K/V types (q8 keys + q4 values)
|
|
124
|
+
- [ ] Context length sweep charts
|
|
125
|
+
- [ ] HuggingFace backend (vLLM, TGI)
|
|
126
|
+
- [ ] TurboQuant integration
|
|
127
|
+
- [ ] Multi-model matrix
|
|
128
|
+
- [ ] HuggingFace Spaces leaderboard
|
|
129
|
+
- [ ] Community result submissions
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
Apache 2.0
|
|
134
|
+
|
|
135
|
+
## Related
|
|
136
|
+
|
|
137
|
+
- [turboquant](https://github.com/back2matching/turboquant) -- TurboQuant KV cache compression (sub-4-bit)
|
|
138
|
+
- [NVIDIA kvpress](https://github.com/NVIDIA/kvpress) -- KV cache eviction/pruning methods
|
|
139
|
+
- [llama.cpp](https://github.com/ggml-org/llama.cpp) -- Where KV cache quantization lives
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# kvcache-bench
|
|
2
|
+
|
|
3
|
+
Benchmark every KV cache compression method on your GPU. One command, real numbers.
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
kvcache-bench --model qwen3.5:9b
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
| KV Type | Context | Prompt | Gen tok/s | Prefill tok/s | VRAM +MB | Quality |
|
|
11
|
+
|---------|---------|-----------|-----------|---------------|----------|---------|
|
|
12
|
+
| f16 | 4096 | short | 80.1 | 712.3 | +142 | PASS |
|
|
13
|
+
| q8_0 | 4096 | short | 79.5 | 723.5 | +71 | PASS |
|
|
14
|
+
| q4_0 | 4096 | short | 78.2 | 698.1 | +36 | PASS |
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Why
|
|
18
|
+
|
|
19
|
+
When you run a local LLM, the KV cache eats your VRAM. Ollama and llama.cpp support different KV cache quantization types (f16, q8_0, q4_0), but nobody tells you what the actual tradeoff is on YOUR hardware.
|
|
20
|
+
|
|
21
|
+
Current state of the world:
|
|
22
|
+
- You Google "ollama kv cache quantization" and find forum posts with conflicting advice
|
|
23
|
+
- You manually test each config, eyeball nvidia-smi, and guess
|
|
24
|
+
- No tool compares them systematically
|
|
25
|
+
|
|
26
|
+
kvcache-bench fixes this. It tests every KV cache type on your GPU and gives you a comparison table with speed, VRAM, and quality.
|
|
27
|
+
|
|
28
|
+
## Install
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install kvcache-bench
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
# Auto-detect your first model, test all KV types
|
|
38
|
+
kvcache-bench
|
|
39
|
+
|
|
40
|
+
# Specific model
|
|
41
|
+
kvcache-bench --model qwen3.5:9b
|
|
42
|
+
|
|
43
|
+
# Test at multiple context lengths (where KV savings matter most)
|
|
44
|
+
kvcache-bench --model llama3.1:8b --context 4096,8192,16384
|
|
45
|
+
|
|
46
|
+
# Include tool calling test
|
|
47
|
+
kvcache-bench --model qwen3.5:9b --prompts short,code,reasoning,tool_call
|
|
48
|
+
|
|
49
|
+
# Save results as JSON
|
|
50
|
+
kvcache-bench --model qwen3.5:9b --json results.json
|
|
51
|
+
|
|
52
|
+
# Just show GPU info
|
|
53
|
+
kvcache-bench --gpu
|
|
54
|
+
|
|
55
|
+
# List available models
|
|
56
|
+
kvcache-bench --list-models
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## What It Tests
|
|
60
|
+
|
|
61
|
+
For each KV cache type (f16, q8_0, q4_0), it measures:
|
|
62
|
+
|
|
63
|
+
| Metric | How |
|
|
64
|
+
|--------|-----|
|
|
65
|
+
| **Generation speed** | Tokens per second during generation |
|
|
66
|
+
| **Prefill speed** | Tokens per second processing the prompt |
|
|
67
|
+
| **VRAM delta** | Extra VRAM used beyond model weights (measured via nvidia-smi) |
|
|
68
|
+
| **Quality** | Auto-checked against expected answers (Paris, code structure, reasoning) |
|
|
69
|
+
|
|
70
|
+
## How It Works
|
|
71
|
+
|
|
72
|
+
1. Detects your GPU and Ollama installation
|
|
73
|
+
2. For each KV cache type: restarts Ollama with `OLLAMA_KV_CACHE_TYPE=<type>`, warms up the model, runs benchmark prompts
|
|
74
|
+
3. Measures VRAM before and during inference via nvidia-smi
|
|
75
|
+
4. Extracts timing from Ollama's API response (prompt_eval_duration, eval_duration)
|
|
76
|
+
5. Checks response quality with simple auto-graders
|
|
77
|
+
6. Produces a markdown table (and optional JSON)
|
|
78
|
+
|
|
79
|
+
## What the Research Says
|
|
80
|
+
|
|
81
|
+
Based on llama.cpp community benchmarks and our testing:
|
|
82
|
+
|
|
83
|
+
| KV Type | VRAM Savings | Perplexity Impact | Best For |
|
|
84
|
+
|---------|-------------|-------------------|----------|
|
|
85
|
+
| f16 | Baseline | None | When you have VRAM to spare |
|
|
86
|
+
| q8_0 | 2x | +0.004 (negligible) | **Default recommendation.** Free VRAM, zero quality cost. |
|
|
87
|
+
| q4_0 | 4x | +0.2 (noticeable) | When you need max context length or are VRAM-constrained |
|
|
88
|
+
|
|
89
|
+
The sweet spot for most users: **q8_0**. Halves your KV cache VRAM with essentially zero quality loss.
|
|
90
|
+
|
|
91
|
+
## Requirements
|
|
92
|
+
|
|
93
|
+
- Python 3.10+
|
|
94
|
+
- NVIDIA GPU with nvidia-smi
|
|
95
|
+
- Ollama installed and running
|
|
96
|
+
|
|
97
|
+
## Roadmap
|
|
98
|
+
|
|
99
|
+
- [ ] Mixed K/V types (q8 keys + q4 values)
|
|
100
|
+
- [ ] Context length sweep charts
|
|
101
|
+
- [ ] HuggingFace backend (vLLM, TGI)
|
|
102
|
+
- [ ] TurboQuant integration
|
|
103
|
+
- [ ] Multi-model matrix
|
|
104
|
+
- [ ] HuggingFace Spaces leaderboard
|
|
105
|
+
- [ ] Community result submissions
|
|
106
|
+
|
|
107
|
+
## License
|
|
108
|
+
|
|
109
|
+
Apache 2.0
|
|
110
|
+
|
|
111
|
+
## Related
|
|
112
|
+
|
|
113
|
+
- [turboquant](https://github.com/back2matching/turboquant) -- TurboQuant KV cache compression (sub-4-bit)
|
|
114
|
+
- [NVIDIA kvpress](https://github.com/NVIDIA/kvpress) -- KV cache eviction/pruning methods
|
|
115
|
+
- [llama.cpp](https://github.com/ggml-org/llama.cpp) -- Where KV cache quantization lives
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""
|
|
2
|
+
kvcache-bench: Benchmark every KV cache compression method on your GPU.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
kvcache-bench --model qwen3.5:9b
|
|
6
|
+
kvcache-bench --model llama3.1:8b --context 8192,16384,32768
|
|
7
|
+
kvcache-bench --all-types --json results.json
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
"""Core benchmark engine: runs comparison across KV cache types."""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import subprocess
|
|
7
|
+
from dataclasses import dataclass, asdict, field
|
|
8
|
+
from typing import Optional
|
|
9
|
+
from kvcache_bench.gpu import detect_gpu, measure_vram, VramTracker
|
|
10
|
+
from kvcache_bench.ollama import (
|
|
11
|
+
check_ollama, list_models, run_inference, run_chat,
|
|
12
|
+
BENCH_PROMPTS, BENCH_TOOL, KV_TYPES, MIXED_KV,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class BenchResult:
    """One benchmark measurement for a single (model, kv_type, context, prompt) combo."""
    model: str
    kv_type: str  # "f16", "q8_0", "q4_0", or "q8_0/q4_0" for mixed
    context_length: int  # num_ctx requested from Ollama
    prompt_type: str  # key into BENCH_PROMPTS ("short", "code", ...), or a raw prompt
    prompt_tokens: int  # prompt_eval_count reported by the Ollama API
    generated_tokens: int  # eval_count reported by the Ollama API
    prompt_eval_rate: float  # tok/s
    eval_rate: float  # tok/s
    vram_baseline_mb: int  # VRAM in use just before the request
    vram_peak_mb: int  # highest VRAM sample observed around the request
    vram_delta_mb: int  # peak - baseline
    total_time_s: float  # wall-clock time for the whole request
    response_preview: str  # first 200 chars of the model output
    correct: Optional[bool] = None  # For quality checks; None when not auto-checkable
    error: Optional[str] = None  # set when the request failed
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def set_kv_cache_env(kv_type_k: str, kv_type_v: str):
    """Set Ollama KV cache environment variables. Requires Ollama restart.

    Note: Ollama exposes only a single OLLAMA_KV_CACHE_TYPE, so separate
    K/V types cannot be configured through env vars; kv_type_v is
    currently ignored and both K and V use kv_type_k. Mixed K/V testing
    would need the llama.cpp server driven directly.
    """
    os.environ.update({"OLLAMA_KV_CACHE_TYPE": kv_type_k})
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def restart_ollama_with_kv(kv_type: str) -> bool:
    """
    Restart Ollama with a new KV cache type.

    Ollama's KV cache type is server-level, so we must restart between
    tests: kill the running server, set the env vars, start a fresh one,
    then poll until the API answers.

    Args:
        kv_type: value for OLLAMA_KV_CACHE_TYPE ("f16", "q8_0", "q4_0").

    Returns:
        True once the server responds (within ~30s), False otherwise.
    """
    try:
        # Kill existing Ollama. On Windows it runs as a tray app plus a
        # server process, so both images are targeted.
        if os.name == 'nt':
            subprocess.run(["taskkill", "/F", "/IM", "ollama.exe"], capture_output=True, timeout=5)
            subprocess.run(["taskkill", "/F", "/IM", "ollama app.exe"], capture_output=True, timeout=5)
        else:
            subprocess.run(["pkill", "-f", "ollama"], capture_output=True, timeout=5)

        time.sleep(3)  # let the old process release the port and VRAM

        # Server-level env vars inherited by the new process.
        os.environ["OLLAMA_FLASH_ATTENTION"] = "1"
        os.environ["OLLAMA_KV_CACHE_TYPE"] = kv_type
        os.environ["OLLAMA_NUM_PARALLEL"] = "1"  # Consistent for benchmarking

        # Resolve the binary once (the two Popen branches were duplicated):
        # prefer the default Windows install path, else rely on PATH.
        ollama_path = "ollama"
        if os.name == 'nt':
            win_path = os.path.expandvars(r"%LOCALAPPDATA%\Programs\Ollama\ollama.exe")
            if os.path.exists(win_path):
                ollama_path = win_path

        subprocess.Popen(
            [ollama_path, "serve"],
            env={**os.environ},
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        # Wait for it to come up: poll the API once per second, up to 30s.
        for _ in range(30):
            time.sleep(1)
            if check_ollama():
                return True
        return False

    except Exception as e:
        print(f" Failed to restart Ollama: {e}")
        return False
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def check_quality(prompt_type: str, response: str) -> Optional[bool]:
    """Auto-grade a response for the standard benchmark prompts.

    Returns True/False for the prompt types we know how to check
    ("short", "reasoning", "code") and None for everything else.
    """
    import re

    # Drop thinking tags (Qwen3.5 emits <think>...</think>) so grading
    # looks at the final answer only; if nothing survives the strip,
    # fall back to the raw response.
    clean = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
    if not clean:
        clean = response
    clean_lower = clean.lower().strip()

    # Dispatch table instead of an if/elif chain; each grader is a
    # zero-arg callable closing over the cleaned text.
    graders = {
        "short": lambda: "paris" in clean_lower,
        "reasoning": lambda: "9" in clean,
        "code": lambda: "def " in clean and "return" in clean,
    }
    grader = graders.get(prompt_type)
    return grader() if grader is not None else None
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def run_single_bench(
    model: str,
    kv_type: str,
    prompt_type: str,
    context_length: int = 4096,
) -> BenchResult:
    """Run a single benchmark measurement.

    Sends one prompt to Ollama (chat endpoint with a tool schema for
    "tool_call", plain generate otherwise), tracking VRAM and wall time
    around the request, then folds the API timing fields into a BenchResult.

    Args:
        model: Ollama model name.
        kv_type: KV cache type label recorded in the result.
        prompt_type: key into BENCH_PROMPTS, or a literal prompt string.
        context_length: num_ctx to request.

    Returns:
        A populated BenchResult; on failure, one with `error` set.
    """
    prompt = BENCH_PROMPTS.get(prompt_type, prompt_type)

    tracker = VramTracker()
    tracker.start()

    t0 = time.perf_counter()

    if prompt_type == "tool_call":
        result = run_chat(
            model,
            [{"role": "user", "content": prompt}],
            num_ctx=context_length,
            max_tokens=100,
            tools=[BENCH_TOOL],
        )
    else:
        result = run_inference(model, prompt, num_ctx=context_length, max_tokens=150)

    elapsed = time.perf_counter() - t0
    vram = tracker.stop()

    if not result or "error" in result:
        # BUGFIX: result may be None here, so guard before calling .get on it.
        err = result.get("error", "Unknown error") if result else "No response from Ollama"
        return BenchResult(
            model=model, kv_type=kv_type, context_length=context_length,
            prompt_type=prompt_type, prompt_tokens=0, generated_tokens=0,
            prompt_eval_rate=0, eval_rate=0,
            vram_baseline_mb=vram["baseline_mb"], vram_peak_mb=vram["peak_mb"],
            vram_delta_mb=vram["delta_mb"], total_time_s=elapsed,
            response_preview="", error=err,
        )

    # /api/generate returns "response"; /api/chat returns "message".
    response_text = result.get("response", "")
    if not response_text and "message" in result:
        msg = result["message"]
        if isinstance(msg, dict):
            response_text = msg.get("content", "")
            if msg.get("tool_calls"):
                # For tool calls, grade/preview the emitted function call itself.
                response_text = json.dumps(msg["tool_calls"][0]["function"])

    prompt_eval_count = result.get("prompt_eval_count", 0)
    eval_count = result.get("eval_count", 0)
    # BUGFIX: default missing durations to 0 (not 1 ns) so the guards below
    # report a rate of 0 instead of an absurd tok/s figure like count/1e-9.
    prompt_eval_dur = result.get("prompt_eval_duration", 0) / 1e9  # ns to s
    eval_dur = result.get("eval_duration", 0) / 1e9

    return BenchResult(
        model=model,
        kv_type=kv_type,
        context_length=context_length,
        prompt_type=prompt_type,
        prompt_tokens=prompt_eval_count,
        generated_tokens=eval_count,
        prompt_eval_rate=round(prompt_eval_count / prompt_eval_dur, 1) if prompt_eval_dur > 0 else 0,
        eval_rate=round(eval_count / eval_dur, 1) if eval_dur > 0 else 0,
        vram_baseline_mb=vram["baseline_mb"],
        vram_peak_mb=vram["peak_mb"],
        vram_delta_mb=vram["delta_mb"],
        total_time_s=round(elapsed, 2),
        response_preview=response_text[:200],
        correct=check_quality(prompt_type, response_text),
    )
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def run_full_benchmark(
    model: str,
    kv_types: list[str] | None = None,
    context_lengths: list[int] | None = None,
    prompt_types: list[str] | None = None,
    auto_restart: bool = True,
) -> list[BenchResult]:
    """Run the full benchmark matrix: kv_types x context_lengths x prompt_types.

    Args:
        model: Ollama model name.
        kv_types: KV cache types to test (default: KV_TYPES).
        context_lengths: num_ctx values to test (default: [4096]).
        prompt_types: prompt keys to test (default: short/code/reasoning).
        auto_restart: restart Ollama with OLLAMA_KV_CACHE_TYPE before each
            KV type; required for the type to actually take effect.

    Returns:
        One BenchResult per (kv_type, context, prompt) combination attempted.
        KV types whose restart fails are skipped entirely.
    """
    # FIX: parameters defaulting to None are annotated `list[...] | None`
    # (the old bare `list[str] = None` annotations were incorrect).
    if kv_types is None:
        kv_types = KV_TYPES
    if context_lengths is None:
        context_lengths = [4096]
    if prompt_types is None:
        prompt_types = ["short", "code", "reasoning"]

    results = []
    gpu = detect_gpu()

    # Banner describing the run configuration.
    print(f"\n{'='*70}")
    print(f"KVCache-Bench v0.1.0")
    print(f"{'='*70}")
    if gpu:
        print(f"GPU: {gpu.name} ({gpu.vram_total_mb} MB VRAM)")
    print(f"Model: {model}")
    print(f"KV types: {', '.join(kv_types)}")
    print(f"Context lengths: {', '.join(str(c) for c in context_lengths)}")
    print(f"Prompts: {', '.join(prompt_types)}")
    print(f"{'='*70}\n")

    for kv_type in kv_types:
        print(f"\n--- KV type: {kv_type} ---")

        if auto_restart:
            print(f" Restarting Ollama with OLLAMA_KV_CACHE_TYPE={kv_type}...")
            if not restart_ollama_with_kv(kv_type):
                print(f" FAILED to restart Ollama. Skipping.")
                continue

        # Warm up: load model so the first measured run isn't paying model-load cost.
        print(f" Warming up model...")
        run_inference(model, "Hi", num_ctx=512, max_tokens=1)
        time.sleep(2)

        for ctx in context_lengths:
            for pt in prompt_types:
                print(f" [{kv_type}] ctx={ctx}, prompt={pt}...", end=" ", flush=True)
                result = run_single_bench(model, kv_type, pt, ctx)
                results.append(result)

                if result.error:
                    print(f"ERROR: {result.error}")
                else:
                    quality = "?" if result.correct is None else ("PASS" if result.correct else "FAIL")
                    print(f"{result.eval_rate} tok/s, VRAM +{result.vram_delta_mb}MB, quality={quality}")

    return results
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def format_results_table(results: "list[BenchResult]") -> str:
    """Format benchmark results as a markdown table.

    Failed runs render an ERROR row carrying the first 30 chars of the
    error; successful runs render rates, VRAM delta, and a PASS/FAIL/?
    quality flag (None -> "?"). Leading/trailing blank lines pad the table.
    """
    lines = []
    lines.append("")
    # FIX: header/separator were f-strings with no placeholders; use plain
    # string literals.
    lines.append("| KV Type | Context | Prompt | Gen tok/s | Prefill tok/s | VRAM +MB | Quality |")
    lines.append("|---------|---------|--------|-----------|---------------|----------|---------|")

    for r in results:
        if r.error:
            lines.append(f"| {r.kv_type} | {r.context_length} | {r.prompt_type} | ERROR | - | - | {r.error[:30]} |")
        else:
            q = "?" if r.correct is None else ("PASS" if r.correct else "FAIL")
            lines.append(f"| {r.kv_type} | {r.context_length} | {r.prompt_type} | {r.eval_rate} | {r.prompt_eval_rate} | +{r.vram_delta_mb} | {q} |")

    lines.append("")
    return "\n".join(lines)
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""CLI entry point: kvcache-bench command."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from kvcache_bench.gpu import detect_gpu
|
|
8
|
+
from kvcache_bench.ollama import check_ollama, list_models, KV_TYPES
|
|
9
|
+
from kvcache_bench.bench import run_full_benchmark, format_results_table
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def main():
    """CLI entry point for `kvcache-bench`.

    Parses flags, handles the info-only modes (--gpu, --list-models),
    resolves a model (auto-picking the first available one if --model is
    omitted), runs the benchmark matrix, then prints a markdown table,
    an optional JSON dump, and a summary with a recommendation.
    """
    parser = argparse.ArgumentParser(
        prog="kvcache-bench",
        description="Benchmark every KV cache compression method on your GPU.",
    )
    parser.add_argument("--model", "-m", help="Ollama model name (e.g., qwen3.5:9b)")
    parser.add_argument("--context", "-c", default="4096",
                        help="Context lengths, comma-separated (default: 4096)")
    parser.add_argument("--types", "-t", default=",".join(KV_TYPES),
                        help=f"KV cache types to test (default: {','.join(KV_TYPES)})")
    parser.add_argument("--prompts", "-p", default="short,code,reasoning",
                        help="Prompt types: short,code,reasoning,long,tool_call")
    parser.add_argument("--json", "-j", help="Save results to JSON file")
    parser.add_argument("--no-restart", action="store_true",
                        help="Don't restart Ollama between tests (use current KV type)")
    parser.add_argument("--list-models", action="store_true", help="List available Ollama models")
    parser.add_argument("--gpu", action="store_true", help="Show GPU info and exit")
    args = parser.parse_args()

    # GPU info (info-only mode: print and exit before touching Ollama)
    if args.gpu:
        gpu = detect_gpu()
        if gpu:
            print(f"GPU: {gpu.name}")
            print(f"VRAM: {gpu.vram_used_mb}/{gpu.vram_total_mb} MB ({gpu.vram_free_mb} MB free)")
            print(f"Driver: {gpu.driver_version}")
        else:
            print("No NVIDIA GPU detected.")
        return

    # Check Ollama — everything below requires a live server
    if not check_ollama():
        print("Ollama is not running. Start it with: ollama serve")
        sys.exit(1)

    # List models (info-only mode)
    if args.list_models:
        models = list_models()
        if models:
            print("Available models:")
            for m in models:
                print(f"  - {m}")
        else:
            print("No models found. Pull one with: ollama pull qwen3.5:9b")
        return

    # Need a model: fall back to the first installed one
    if not args.model:
        models = list_models()
        if models:
            args.model = models[0]
            print(f"No model specified, using: {args.model}")
        else:
            print("No model specified and no models available.")
            print("Usage: kvcache-bench --model qwen3.5:9b")
            sys.exit(1)

    # Parse args (comma-separated lists into Python lists)
    context_lengths = [int(c) for c in args.context.split(",")]
    kv_types = args.types.split(",")
    prompt_types = args.prompts.split(",")

    # Run
    from dataclasses import asdict
    results = run_full_benchmark(
        model=args.model,
        kv_types=kv_types,
        context_lengths=context_lengths,
        prompt_types=prompt_types,
        auto_restart=not args.no_restart,
    )

    # Output
    table = format_results_table(results)
    print(table)

    # Save JSON
    if args.json:
        out_path = Path(args.json)
        with open(out_path, "w") as f:
            json.dump([asdict(r) for r in results], f, indent=2)
        print(f"\nResults saved to {out_path}")

    # Summary
    if results:
        gpu = detect_gpu()
        print(f"\nGPU: {gpu.name if gpu else 'Unknown'}")
        print(f"Model: {args.model}")
        print(f"Tests: {len(results)} ({len(kv_types)} types x {len(context_lengths)} contexts x {len(prompt_types)} prompts)")

        # Find best config among runs that actually produced tokens
        valid = [r for r in results if not r.error and r.eval_rate > 0]
        if valid:
            fastest = max(valid, key=lambda r: r.eval_rate)
            lowest_vram = min(valid, key=lambda r: r.vram_delta_mb)
            print(f"\nFastest: {fastest.kv_type} ({fastest.eval_rate} tok/s)")
            print(f"Lowest VRAM: {lowest_vram.kv_type} (+{lowest_vram.vram_delta_mb} MB)")

            # Recommendation
            # q8_0 is usually the sweet spot (negligible quality loss, good VRAM savings)
            q8_results = [r for r in valid if r.kv_type == "q8_0"]
            if q8_results:
                q8_avg_rate = sum(r.eval_rate for r in q8_results) / len(q8_results)
                f16_results = [r for r in valid if r.kv_type == "f16"]
                if f16_results:
                    f16_avg_rate = sum(r.eval_rate for r in f16_results) / len(f16_results)
                    speed_diff = abs(q8_avg_rate - f16_avg_rate) / f16_avg_rate * 100
                    if speed_diff < 5:
                        print(f"\nRecommendation: Use q8_0. Near-zero speed difference ({speed_diff:.1f}%) with 2x VRAM savings.")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""GPU detection and VRAM monitoring."""
|
|
2
|
+
|
|
3
|
+
import subprocess
|
|
4
|
+
import time
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class GpuInfo:
    """Snapshot of NVIDIA GPU identity and memory state, parsed from nvidia-smi."""
    name: str  # GPU product name, e.g. "NVIDIA GeForce RTX 4090"
    vram_total_mb: int
    vram_used_mb: int
    vram_free_mb: int
    driver_version: str
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def detect_gpu() -> Optional[GpuInfo]:
    """Detect an NVIDIA GPU via nvidia-smi.

    Returns info for the first GPU, or None when nvidia-smi is missing,
    times out, exits non-zero, or produces unparseable output.
    """
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=name,memory.total,memory.used,memory.free,driver_version",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5,
        )
        if result.returncode != 0:
            return None
        # BUGFIX: on multi-GPU systems nvidia-smi prints one CSV line per
        # GPU; splitting the whole stdout on "," bled fields across lines.
        # Parse the first line (GPU 0) only.
        out_lines = result.stdout.strip().splitlines()
        if not out_lines:
            return None
        parts = out_lines[0].split(",")
        if len(parts) < 5:
            return None
        return GpuInfo(
            name=parts[0].strip(),
            vram_total_mb=int(float(parts[1].strip())),
            vram_used_mb=int(float(parts[2].strip())),
            vram_free_mb=int(float(parts[3].strip())),
            driver_version=parts[4].strip(),
        )
    # ValueError added: malformed numeric fields now return None instead of raising.
    except (FileNotFoundError, subprocess.TimeoutExpired, ValueError):
        return None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def measure_vram() -> int:
    """Get current VRAM usage of GPU 0 in MB (0 if nvidia-smi is unavailable)."""
    try:
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5,
        )
        # BUGFIX: multi-GPU systems print one value per line; float() on the
        # whole multi-line blob raised (silently returning 0 via the except).
        # Parse the first line (GPU 0) only.
        return int(float(result.stdout.strip().splitlines()[0]))
    except Exception:
        return 0
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class VramTracker:
    """Track VRAM usage over a time period.

    Call start() before the workload, sample() as often as desired while
    it runs, and stop() to take a final sample and get summary stats
    (baseline/peak/delta in MB plus the sample count).
    """

    def __init__(self):
        self.baseline = 0   # reading taken at start()
        self.peak = 0       # highest reading seen so far
        self._samples = []  # every reading, in order

    def start(self):
        """Record the baseline reading and reset peak/samples."""
        first = measure_vram()
        self.baseline = first
        self.peak = first
        self._samples = [first]

    def sample(self):
        """Take one reading and fold it into the running peak."""
        reading = measure_vram()
        self._samples.append(reading)
        if reading > self.peak:
            self.peak = reading

    def stop(self) -> dict:
        """Take a final reading and return baseline/peak/delta/sample-count stats."""
        self.sample()
        return {
            "baseline_mb": self.baseline,
            "peak_mb": self.peak,
            "delta_mb": self.peak - self.baseline,
            "samples": len(self._samples),
        }
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""Ollama backend: run inference with different KV cache types."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
import requests
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Base URL of the local Ollama HTTP API (default install port).
OLLAMA_BASE = "http://localhost:11434"

# KV cache types supported by Ollama/llama.cpp
KV_TYPES = ["f16", "q8_0", "q4_0"]

# Mixed K/V configurations worth testing, as (key_type, value_type) pairs.
MIXED_KV = [
    ("f16", "f16"),    # Baseline
    ("q8_0", "q8_0"),  # 8-bit both
    ("q4_0", "q4_0"),  # 4-bit both
    ("q8_0", "q4_0"),  # 8-bit keys, 4-bit values (sweet spot per research)
    ("q4_0", "q8_0"),  # 4-bit keys, 8-bit values (keys are more sensitive)
]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class OllamaResult:
    """Result of a single benchmarked Ollama inference run."""

    model: str                # Ollama model name, e.g. "llama3.1:8b"
    kv_type_k: str            # KV cache quantization type used for keys
    kv_type_v: str            # KV cache quantization type used for values
    context_length: int       # num_ctx the run was configured with
    prompt_tokens: int        # tokens consumed by the prompt (prefill)
    generated_tokens: int     # tokens produced during generation
    prompt_eval_rate: float   # tok/s
    eval_rate: float          # tok/s
    total_duration_ms: float  # wall-clock time for the whole request
    response_text: str = ""   # raw model output (used by quality graders)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def check_ollama() -> bool:
    """Return True if an Ollama server is reachable at OLLAMA_BASE."""
    try:
        response = requests.get(f"{OLLAMA_BASE}/", timeout=3)
    except Exception:
        return False
    return response.status_code == 200
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def list_models() -> list[str]:
    """Return the names of all models known to the local Ollama server.

    Returns an empty list if Ollama is unreachable or replies with an
    unexpected payload.
    """
    try:
        response = requests.get(f"{OLLAMA_BASE}/api/tags", timeout=5)
        payload = response.json()
        names = [entry["name"] for entry in payload.get("models", [])]
    except Exception:
        names = []
    return names
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def run_inference(
    model: str,
    prompt: str,
    num_ctx: int = 4096,
    max_tokens: int = 100,
    temperature: float = 0.0,
) -> Optional[dict]:
    """Run a single non-streaming generation via the Ollama API.

    Args:
        model: Ollama model name.
        prompt: Raw prompt text.
        num_ctx: Context window size to request.
        max_tokens: Maximum number of tokens to generate (num_predict).
        temperature: Sampling temperature (0.0 for deterministic runs).

    Returns:
        The parsed JSON response, or {"error": "<message>"} on any
        network/parse failure instead of raising.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {
            "num_ctx": num_ctx,
            "num_predict": max_tokens,
            "temperature": temperature,
        },
    }
    try:
        response = requests.post(
            f"{OLLAMA_BASE}/api/generate",
            json=payload,
            timeout=300,  # long-context prefill on slow GPUs can take minutes
        )
        return response.json()
    except Exception as e:
        return {"error": str(e)}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def run_chat(
    model: str,
    messages: list[dict],
    num_ctx: int = 4096,
    max_tokens: int = 100,
    tools: Optional[list] = None,
    temperature: float = 0.0,
) -> Optional[dict]:
    """Run a chat completion via the Ollama API.

    Args:
        model: Ollama model name.
        messages: Chat history as [{"role": ..., "content": ...}, ...].
        num_ctx: Context window size to request.
        max_tokens: Maximum number of tokens to generate (num_predict).
        tools: Optional tool/function definitions for tool-calling models.
        temperature: Sampling temperature. Previously hard-coded to 0.0;
            exposed as a parameter (same default) for consistency with
            run_inference.

    Returns:
        The parsed JSON response, or {"error": "<message>"} on any
        network/parse failure instead of raising.
    """
    body = {
        "model": model,
        "messages": messages,
        "stream": False,
        "options": {
            "num_ctx": num_ctx,
            "num_predict": max_tokens,
            "temperature": temperature,
        },
    }
    if tools:
        body["tools"] = tools
    try:
        r = requests.post(f"{OLLAMA_BASE}/api/chat", json=body, timeout=300)
        return r.json()
    except Exception as e:
        return {"error": str(e)}
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# Standard prompts for benchmarking, keyed by scenario name.
# "long" repeats a filler sentence 100x to exercise prefill with a larger
# prompt; the others probe short answers, code gen, and reasoning quality.
BENCH_PROMPTS = {
    "short": "What is the capital of France? Answer in one word.",
    "code": "Write a Python function to check if a number is prime. Just the function, no explanation.",
    "long": "You are an expert software engineer. " * 100 + "\n\nAnalyze the time complexity of binary search and explain why it's O(log n).",
    "reasoning": "A farmer has 17 sheep. All but 9 die. How many sheep are left? Think step by step.",
    "tool_call": "Get the current weather in Tokyo.",
}
|
|
121
|
+
|
|
122
|
+
# Tool definition used by the "tool_call" benchmark prompt
# (OpenAI-style function schema, as accepted by Ollama's /api/chat).
BENCH_TOOL = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {"location": {"type": "string", "description": "City name"}},
            "required": ["location"],
        },
    },
}
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: kvcache-bench
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Benchmark every KV cache compression method on your GPU. One command, real numbers.
|
|
5
|
+
Author: back2matching
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/back2matching/kvcache-bench
|
|
8
|
+
Keywords: llm,kv-cache,benchmark,vram,gpu,ollama,llama-cpp,quantization
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: requests>=2.28.0
|
|
17
|
+
Requires-Dist: pynvml>=11.5.0
|
|
18
|
+
Provides-Extra: charts
|
|
19
|
+
Requires-Dist: matplotlib; extra == "charts"
|
|
20
|
+
Provides-Extra: hf
|
|
21
|
+
Requires-Dist: torch; extra == "hf"
|
|
22
|
+
Requires-Dist: transformers; extra == "hf"
|
|
23
|
+
Dynamic: license-file
|
|
24
|
+
|
|
25
|
+
# kvcache-bench
|
|
26
|
+
|
|
27
|
+
Benchmark every KV cache compression method on your GPU. One command, real numbers.
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
kvcache-bench --model qwen3.5:9b
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
| KV Type | Context | Prompt | Gen tok/s | Prefill tok/s | VRAM +MB | Quality |
|
|
35
|
+
|---------|---------|-----------|-----------|---------------|----------|---------|
|
|
36
|
+
| f16 | 4096 | short | 80.1 | 712.3 | +142 | PASS |
|
|
37
|
+
| q8_0 | 4096 | short | 79.5 | 723.5 | +71 | PASS |
|
|
38
|
+
| q4_0 | 4096 | short | 78.2 | 698.1 | +36 | PASS |
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Why
|
|
42
|
+
|
|
43
|
+
When you run a local LLM, the KV cache eats your VRAM. Ollama and llama.cpp support different KV cache quantization types (f16, q8_0, q4_0), but nobody tells you what the actual tradeoff is on YOUR hardware.
|
|
44
|
+
|
|
45
|
+
Current state of the world:
|
|
46
|
+
- You Google "ollama kv cache quantization" and find forum posts with conflicting advice
|
|
47
|
+
- You manually test each config, eyeball nvidia-smi, and guess
|
|
48
|
+
- No tool compares them systematically
|
|
49
|
+
|
|
50
|
+
kvcache-bench fixes this. It tests every KV cache type on your GPU and gives you a comparison table with speed, VRAM, and quality.
|
|
51
|
+
|
|
52
|
+
## Install
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install kvcache-bench
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Usage
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
# Auto-detect your first model, test all KV types
|
|
62
|
+
kvcache-bench
|
|
63
|
+
|
|
64
|
+
# Specific model
|
|
65
|
+
kvcache-bench --model qwen3.5:9b
|
|
66
|
+
|
|
67
|
+
# Test at multiple context lengths (where KV savings matter most)
|
|
68
|
+
kvcache-bench --model llama3.1:8b --context 4096,8192,16384
|
|
69
|
+
|
|
70
|
+
# Include tool calling test
|
|
71
|
+
kvcache-bench --model qwen3.5:9b --prompts short,code,reasoning,tool_call
|
|
72
|
+
|
|
73
|
+
# Save results as JSON
|
|
74
|
+
kvcache-bench --model qwen3.5:9b --json results.json
|
|
75
|
+
|
|
76
|
+
# Just show GPU info
|
|
77
|
+
kvcache-bench --gpu
|
|
78
|
+
|
|
79
|
+
# List available models
|
|
80
|
+
kvcache-bench --list-models
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## What It Tests
|
|
84
|
+
|
|
85
|
+
For each KV cache type (f16, q8_0, q4_0), it measures:
|
|
86
|
+
|
|
87
|
+
| Metric | How |
|
|
88
|
+
|--------|-----|
|
|
89
|
+
| **Generation speed** | Tokens per second during generation |
|
|
90
|
+
| **Prefill speed** | Tokens per second processing the prompt |
|
|
91
|
+
| **VRAM delta** | Extra VRAM used beyond model weights (measured via nvidia-smi) |
|
|
92
|
+
| **Quality** | Auto-checked against expected answers (Paris, code structure, reasoning) |
|
|
93
|
+
|
|
94
|
+
## How It Works
|
|
95
|
+
|
|
96
|
+
1. Detects your GPU and Ollama installation
|
|
97
|
+
2. For each KV cache type: restarts Ollama with `OLLAMA_KV_CACHE_TYPE=<type>`, warms up the model, runs benchmark prompts
|
|
98
|
+
3. Measures VRAM before and during inference via nvidia-smi
|
|
99
|
+
4. Extracts timing from Ollama's API response (prompt_eval_duration, eval_duration)
|
|
100
|
+
5. Checks response quality with simple auto-graders
|
|
101
|
+
6. Produces a markdown table (and optional JSON)
|
|
102
|
+
|
|
103
|
+
## What the Research Says
|
|
104
|
+
|
|
105
|
+
Based on llama.cpp community benchmarks and our testing:
|
|
106
|
+
|
|
107
|
+
| KV Type | VRAM Savings | Perplexity Impact | Best For |
|
|
108
|
+
|---------|-------------|-------------------|----------|
|
|
109
|
+
| f16 | Baseline | None | When you have VRAM to spare |
|
|
110
|
+
| q8_0 | 2x | +0.004 (negligible) | **Default recommendation.** Free VRAM, zero quality cost. |
|
|
111
|
+
| q4_0 | 4x | +0.2 (noticeable) | When you need max context length or are VRAM-constrained |
|
|
112
|
+
|
|
113
|
+
The sweet spot for most users: **q8_0**. Halves your KV cache VRAM with essentially zero quality loss.
|
|
114
|
+
|
|
115
|
+
## Requirements
|
|
116
|
+
|
|
117
|
+
- Python 3.10+
|
|
118
|
+
- NVIDIA GPU with nvidia-smi
|
|
119
|
+
- Ollama installed and running
|
|
120
|
+
|
|
121
|
+
## Roadmap
|
|
122
|
+
|
|
123
|
+
- [ ] Mixed K/V types (q8 keys + q4 values)
|
|
124
|
+
- [ ] Context length sweep charts
|
|
125
|
+
- [ ] HuggingFace backend (vLLM, TGI)
|
|
126
|
+
- [ ] TurboQuant integration
|
|
127
|
+
- [ ] Multi-model matrix
|
|
128
|
+
- [ ] HuggingFace Spaces leaderboard
|
|
129
|
+
- [ ] Community result submissions
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
Apache 2.0
|
|
134
|
+
|
|
135
|
+
## Related
|
|
136
|
+
|
|
137
|
+
- [turboquant](https://github.com/back2matching/turboquant) -- TurboQuant KV cache compression (sub-4-bit)
|
|
138
|
+
- [NVIDIA kvpress](https://github.com/NVIDIA/kvpress) -- KV cache eviction/pruning methods
|
|
139
|
+
- [llama.cpp](https://github.com/ggml-org/llama.cpp) -- Where KV cache quantization lives
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
kvcache_bench/__init__.py
|
|
5
|
+
kvcache_bench/bench.py
|
|
6
|
+
kvcache_bench/cli.py
|
|
7
|
+
kvcache_bench/gpu.py
|
|
8
|
+
kvcache_bench/ollama.py
|
|
9
|
+
kvcache_bench.egg-info/PKG-INFO
|
|
10
|
+
kvcache_bench.egg-info/SOURCES.txt
|
|
11
|
+
kvcache_bench.egg-info/dependency_links.txt
|
|
12
|
+
kvcache_bench.egg-info/entry_points.txt
|
|
13
|
+
kvcache_bench.egg-info/requires.txt
|
|
14
|
+
kvcache_bench.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
kvcache_bench
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "kvcache-bench"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Benchmark every KV cache compression method on your GPU. One command, real numbers."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "Apache-2.0"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [{name = "back2matching"}]
|
|
13
|
+
keywords = ["llm", "kv-cache", "benchmark", "vram", "gpu", "ollama", "llama-cpp", "quantization"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"License :: OSI Approved :: Apache Software License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
19
|
+
]
|
|
20
|
+
dependencies = ["requests>=2.28.0", "pynvml>=11.5.0"]
|
|
21
|
+
|
|
22
|
+
[project.optional-dependencies]
|
|
23
|
+
charts = ["matplotlib"]
|
|
24
|
+
hf = ["torch", "transformers"]
|
|
25
|
+
|
|
26
|
+
[project.urls]
|
|
27
|
+
Homepage = "https://github.com/back2matching/kvcache-bench"
|
|
28
|
+
|
|
29
|
+
[project.scripts]
|
|
30
|
+
kvcache-bench = "kvcache_bench.cli:main"
|
|
31
|
+
|
|
32
|
+
[tool.setuptools.packages.find]
|
|
33
|
+
include = ["kvcache_bench*"]
|