cupel 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cupel-0.1.0/.gitignore +170 -0
- cupel-0.1.0/PKG-INFO +471 -0
- cupel-0.1.0/README.md +446 -0
- cupel-0.1.0/_env.example +21 -0
- cupel-0.1.0/config.example.yml +84 -0
- cupel-0.1.0/config.yml +84 -0
- cupel-0.1.0/cupel/__init__.py +2 -0
- cupel-0.1.0/cupel/__main__.py +2 -0
- cupel-0.1.0/cupel/cli.py +509 -0
- cupel-0.1.0/cupel/config.py +99 -0
- cupel-0.1.0/cupel/data/eval-set.example.json +227 -0
- cupel-0.1.0/cupel/data/example-run.json +543 -0
- cupel-0.1.0/cupel/data/starter-eval-set.json +142 -0
- cupel-0.1.0/cupel/discovery.py +146 -0
- cupel-0.1.0/cupel/display.py +76 -0
- cupel-0.1.0/cupel/eval.py +513 -0
- cupel-0.1.0/cupel/server.py +886 -0
- cupel-0.1.0/cupel/ui/index.html +135 -0
- cupel-0.1.0/cupel/ui/pages/author.js +106 -0
- cupel-0.1.0/cupel/ui/pages/dashboard.js +434 -0
- cupel-0.1.0/cupel/ui/pages/evalset.js +89 -0
- cupel-0.1.0/cupel/ui/pages/results.js +74 -0
- cupel-0.1.0/cupel/ui/pages/run.js +330 -0
- cupel-0.1.0/cupel/ui/pages/settings.js +99 -0
- cupel-0.1.0/cupel/ui/style.css +943 -0
- cupel-0.1.0/eval-sets/what-am-i-looking-at.png +0 -0
- cupel-0.1.0/pyproject.toml +42 -0
cupel-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py,cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# poetry
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
102
|
+
poetry.lock
|
|
103
|
+
|
|
104
|
+
# pdm
|
|
105
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
106
|
+
#pdm.lock
|
|
107
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
108
|
+
# in version control.
|
|
109
|
+
# https://pdm.fming.dev/#use-with-ide
|
|
110
|
+
.pdm.toml
|
|
111
|
+
|
|
112
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
113
|
+
__pypackages__/
|
|
114
|
+
|
|
115
|
+
# Celery stuff
|
|
116
|
+
celerybeat-schedule
|
|
117
|
+
celerybeat.pid
|
|
118
|
+
|
|
119
|
+
# SageMath parsed files
|
|
120
|
+
*.sage.py
|
|
121
|
+
|
|
122
|
+
# Environments
|
|
123
|
+
.env
|
|
124
|
+
.venv
|
|
125
|
+
env/
|
|
126
|
+
venv/
|
|
127
|
+
ENV/
|
|
128
|
+
env.bak/
|
|
129
|
+
venv.bak/
|
|
130
|
+
|
|
131
|
+
# Spyder project settings
|
|
132
|
+
.spyderproject
|
|
133
|
+
.spyproject
|
|
134
|
+
|
|
135
|
+
# Rope project settings
|
|
136
|
+
.ropeproject
|
|
137
|
+
|
|
138
|
+
# mkdocs documentation
|
|
139
|
+
/site
|
|
140
|
+
|
|
141
|
+
# mypy
|
|
142
|
+
.mypy_cache/
|
|
143
|
+
.dmypy.json
|
|
144
|
+
dmypy.json
|
|
145
|
+
|
|
146
|
+
# Pyre type checker
|
|
147
|
+
.pyre/
|
|
148
|
+
|
|
149
|
+
# pytype static type analyzer
|
|
150
|
+
.pytype/
|
|
151
|
+
|
|
152
|
+
src/lab/
|
|
153
|
+
hyperland/lab/
|
|
154
|
+
|
|
155
|
+
# Cython debug symbols
|
|
156
|
+
cython_debug/
|
|
157
|
+
|
|
158
|
+
# PyCharm
|
|
159
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
160
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
161
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
162
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
163
|
+
#.idea/
|
|
164
|
+
|
|
165
|
+
.claude
|
|
166
|
+
prototype/
|
|
167
|
+
requirements/
|
|
168
|
+
.DS_Store
|
|
169
|
+
eval-results/
|
|
170
|
+
eval-set.json
|
cupel-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cupel
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: separates precious LLMs from base LLMs. works with any OpenAI/Anthropic compatible API
|
|
5
|
+
Project-URL: Homepage, https://github.com/tolitius/cupel
|
|
6
|
+
Project-URL: Repository, https://github.com/tolitius/cupel
|
|
7
|
+
Author: tolitius
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: benchmark,eval,leaderboard,llm,local-llm
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Requires-Dist: fastapi>=0.110
|
|
20
|
+
Requires-Dist: pyyaml>=6.0
|
|
21
|
+
Requires-Dist: requests>=2.31
|
|
22
|
+
Requires-Dist: rich>=13.0
|
|
23
|
+
Requires-Dist: uvicorn>=0.27
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# bench
|
|
27
|
+
|
|
28
|
+
> run prompts against local LLMs, then let a judge score them
|
|
29
|
+
|
|
30
|
+
bench sends the same set of prompts to multiple models running on any OpenAI-compatible API,<br/>
|
|
31
|
+
collects responses (with thinking separated from the answer), and then scores them using a configurable judge — which can be another local model, Claude, GPT, or anything with an API.
|
|
32
|
+
|
|
33
|
+
- [what it does](#what-it-does)
|
|
34
|
+
- [install](#install)
|
|
35
|
+
- [run it](#run-it)
|
|
36
|
+
- [collect responses](#collect-responses)
|
|
37
|
+
- [run specific prompts](#run-specific-prompts)
|
|
38
|
+
- [score them](#score-them)
|
|
39
|
+
- [judge with Claude](#judge-with-claude)
|
|
40
|
+
- [judge with a local model](#judge-with-a-local-model)
|
|
41
|
+
- [config](#config)
|
|
42
|
+
- [models and inference](#models-and-inference)
|
|
43
|
+
- [judge](#judge)
|
|
44
|
+
- [endpoint and keys](#endpoint-and-keys)
|
|
45
|
+
- [the eval set](#the-eval-set)
|
|
46
|
+
- [prompts](#prompts)
|
|
47
|
+
- [multi-turn prompts](#multi-turn-prompts)
|
|
48
|
+
- [writing your own](#writing-your-own)
|
|
49
|
+
- [thinking models](#thinking-models)
|
|
50
|
+
- [license](#license)
|
|
51
|
+
|
|
52
|
+
# what it does
|
|
53
|
+
|
|
54
|
+
you have a few local models. you want to know which one is actually good at the things _you_ care about — not MMLU, not HumanEval, but your questions, your domain, your work.
|
|
55
|
+
|
|
56
|
+
bench lets you:
|
|
57
|
+
|
|
58
|
+
1. **define prompts** with scoring rubrics in a JSON file
|
|
59
|
+
2. **run** them against multiple models in one shot
|
|
60
|
+
3. **judge** the responses with any model (local or API)
|
|
61
|
+
4. **compare** scores side by side with response times
|
|
62
|
+
|
|
63
|
+
the judge and the models under test don't have to be on the same server.<br/>
|
|
64
|
+
you can collect responses locally and score them with Claude later, or vice versa.
|
|
65
|
+
|
|
66
|
+
# install
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
$ git clone https://github.com/tolitius/cupel.git
|
|
70
|
+
$ cd cupel
|
|
71
|
+
$ pip install .
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
this installs five dependencies: `fastapi`, `pyyaml`, `requests`, `rich`, `uvicorn`
|
|
75
|
+
|
|
76
|
+
alternatively:
|
|
77
|
+
```bash
|
|
78
|
+
$ pip install -r requirements.txt
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
if you skip the install and just run `python eval.py`, it will tell you what's missing:
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
missing required packages: pyyaml
|
|
85
|
+
python: /usr/local/bin/python3.12 (3.12.4)
|
|
86
|
+
|
|
87
|
+
install with one of:
|
|
88
|
+
/usr/local/bin/python3.12 -m pip install . # from repo root
|
|
89
|
+
/usr/local/bin/python3.12 -m pip install -r requirements.txt # alternative
|
|
90
|
+
/usr/local/bin/python3.12 -m pip install pyyaml
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
it shows the exact python path so you don't end up installing packages into the wrong interpreter.
|
|
94
|
+
|
|
95
|
+
# run it
|
|
96
|
+
|
|
97
|
+
bench has two commands: `run` and `judge`
|
|
98
|
+
|
|
99
|
+
## collect responses
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
$ python eval.py run
|
|
103
|
+
```
|
|
104
|
+
```
|
|
105
|
+
config: config.yml
|
|
106
|
+
.env: .env
|
|
107
|
+
endpoint: localhost:8000
|
|
108
|
+
eval set: My Eval Set (12 prompts)
|
|
109
|
+
models: Qwen3.5-35B-A3B-4bit, Qwen3.5-9B-bf16, Qwen3.5-27B-8bit
|
|
110
|
+
thinking: model default
|
|
111
|
+
output: ./eval-results
|
|
112
|
+
|
|
113
|
+
⚡ Local LLM Eval @ localhost:8000
|
|
114
|
+
# Prompt Q35B·4b Q9B·bf16 Q27B·8b
|
|
115
|
+
1 Image Description 19.65s 68.97s 32.4s
|
|
116
|
+
2 Explain a Sorting Algorithm 16.65s 67.06s 28.1s
|
|
117
|
+
...
|
|
118
|
+
Done 12/12 12/12 12/12
|
|
119
|
+
|
|
120
|
+
Saved: eval-results/eval_Qwen3.5-35B-A3B-4bit_20260321_194308.json
|
|
121
|
+
Saved: eval-results/eval_Qwen3.5-9B-bf16_20260321_194308.json
|
|
122
|
+
Saved: eval-results/eval_Qwen3.5-27B-8bit_20260321_194308.json
|
|
123
|
+
|
|
124
|
+
✅ Run complete. To score:
|
|
125
|
+
python eval.py judge eval-results/eval_*.json
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
each result JSON has the prompt, the model's response, thinking (if any), and timing.<br/>
|
|
129
|
+
no scoring happens here — just collection.
|
|
130
|
+
|
|
131
|
+
## run specific prompts
|
|
132
|
+
|
|
133
|
+
don't want to wait through all 22 prompts to retest the 3 you just changed? use `--prompts`:
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
$ python eval.py run --prompts 18-22 # range
|
|
137
|
+
$ python eval.py run --prompts 21,22 # specific IDs
|
|
138
|
+
$ python eval.py run --prompts 1,18-22 # mix of both
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
works with `--models` too:
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
$ python eval.py run --prompts 21,22 --models gpt-oss-120b-MXFP4-Q8
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## score them
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
$ python eval.py judge eval-results/eval_*.json
|
|
151
|
+
```
|
|
152
|
+
```
|
|
153
|
+
judge: Qwen3.5-27B-8bit @ localhost:8000
|
|
154
|
+
scoring: 3 file(s), 12 prompts each
|
|
155
|
+
|
|
156
|
+
⚖ Scoring @ localhost:8000
|
|
157
|
+
# Prompt Q35B·4b Q9B·bf16 Q27B·8b
|
|
158
|
+
1 Image Description 3 (19.65s) 2 (68.97s) 3 (32.4s)
|
|
159
|
+
2 Explain a Sorting Algorithm 2 (16.65s) 1 (67.06s) 3 (28.1s)
|
|
160
|
+
...
|
|
161
|
+
Score 38/51 24/51 41/51
|
|
162
|
+
|
|
163
|
+
Summary: eval-results/scoring_Qwen3.5-27B-8bit_20260321.md
|
|
164
|
+
|
|
165
|
+
Qwen3.5-35B-A3B-4bit 38/51
|
|
166
|
+
Qwen3.5-9B-bf16 24/51
|
|
167
|
+
Qwen3.5-27B-8bit 41/51
|
|
168
|
+
|
|
169
|
+
✅ Judging complete.
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
the judge sends each response + its rubric to the judge model, which returns a 0-3 score:
|
|
173
|
+
|
|
174
|
+
| score | meaning |
|
|
175
|
+
| --- | --- |
|
|
176
|
+
| 0 | wrong or hallucinated |
|
|
177
|
+
| 1 | partially correct |
|
|
178
|
+
| 2 | correct but shallow |
|
|
179
|
+
| 3 | correct and insightful |
|
|
180
|
+
|
|
181
|
+
scores are written back into the result JSONs and a markdown summary is generated.
|
|
182
|
+
|
|
183
|
+
## judge with Claude
|
|
184
|
+
|
|
185
|
+
point the judge at Anthropic's API directly:
|
|
186
|
+
|
|
187
|
+
```yaml
|
|
188
|
+
# config.yml
|
|
189
|
+
judge:
|
|
190
|
+
model: claude-sonnet-4-20250514
|
|
191
|
+
api_url: https://api.anthropic.com/v1/messages
|
|
192
|
+
api_key_env: ANTHROPIC_API_KEY
|
|
193
|
+
```
|
|
194
|
+
```bash
|
|
195
|
+
# .env
|
|
196
|
+
ANTHROPIC_API_KEY=sk-ant-...
|
|
197
|
+
```
|
|
198
|
+
```bash
|
|
199
|
+
$ python eval.py judge eval-results/eval_*.json
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
bench detects `api.anthropic.com` and uses the correct auth header (`x-api-key`) and response format automatically.
|
|
203
|
+
|
|
204
|
+
## judge with a local model
|
|
205
|
+
|
|
206
|
+
```yaml
|
|
207
|
+
# config.yml
|
|
208
|
+
judge:
|
|
209
|
+
model: Qwen3.5-27B-8bit
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
when `api_url` is omitted, the judge uses the same endpoint from `.env` as the eval run.
|
|
213
|
+
|
|
214
|
+
you can also override the judge model from the command line:
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
$ python eval.py judge eval-results/eval_*.json --judge-model Qwen3.5-122B-A10B-4bit
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
or point at a completely different server:
|
|
221
|
+
|
|
222
|
+
```bash
|
|
223
|
+
$ python eval.py judge eval-results/eval_*.json \
|
|
224
|
+
--judge-model gpt-4o \
|
|
225
|
+
--judge-url https://api.openai.com/v1/chat/completions \
|
|
226
|
+
--judge-key-env OPENAI_API_KEY
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
# config
|
|
230
|
+
|
|
231
|
+
## models and inference
|
|
232
|
+
|
|
233
|
+
```yaml
|
|
234
|
+
# config.yml
|
|
235
|
+
|
|
236
|
+
models:
|
|
237
|
+
- Qwen3.5-35B-A3B-4bit
|
|
238
|
+
- Qwen3.5-9B-bf16
|
|
239
|
+
- Qwen3.5-122B-A10B-4bit
|
|
240
|
+
- Qwen3.5-27B-8bit
|
|
241
|
+
- Nemotron-Cascade-2-30B-A3B-8bit
|
|
242
|
+
|
|
243
|
+
eval_set: eval-set.json
|
|
244
|
+
output_dir: ./eval-results
|
|
245
|
+
temperature: 0
|
|
246
|
+
max_tokens: 16384
|
|
247
|
+
thinking: null # null = model default, 0 = off, 4096 = explicit budget
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
`max_tokens` covers _both_ thinking and the response.<br/>
|
|
251
|
+
Qwen3.5 models can burn 2-4K tokens thinking before answering, so 16384 gives plenty of room.
|
|
252
|
+
|
|
253
|
+
## judge
|
|
254
|
+
|
|
255
|
+
```yaml
|
|
256
|
+
# config.yml
|
|
257
|
+
|
|
258
|
+
judge:
|
|
259
|
+
model: claude-sonnet-4-20250514
|
|
260
|
+
api_url: https://api.anthropic.com/v1/messages
|
|
261
|
+
api_key_env: ANTHROPIC_API_KEY
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
| field | what it does | default |
|
|
265
|
+
| --- | --- | --- |
|
|
266
|
+
| `model` | which model scores responses | _(required)_ |
|
|
267
|
+
| `api_url` | endpoint for the judge | reuses `LLM_API_URL` from `.env` |
|
|
268
|
+
| `api_key_env` | env var name holding the key | reuses `LLM_API_KEY` from `.env` |
|
|
269
|
+
|
|
270
|
+
some examples:
|
|
271
|
+
|
|
272
|
+
```yaml
|
|
273
|
+
# local (same server as eval)
|
|
274
|
+
judge:
|
|
275
|
+
model: Qwen3.5-27B-8bit
|
|
276
|
+
|
|
277
|
+
# OpenAI
|
|
278
|
+
judge:
|
|
279
|
+
model: gpt-4o
|
|
280
|
+
api_url: https://api.openai.com/v1/chat/completions
|
|
281
|
+
api_key_env: OPENAI_API_KEY
|
|
282
|
+
|
|
283
|
+
# Anthropic (direct)
|
|
284
|
+
judge:
|
|
285
|
+
model: claude-sonnet-4-20250514
|
|
286
|
+
api_url: https://api.anthropic.com/v1/messages
|
|
287
|
+
api_key_env: ANTHROPIC_API_KEY
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
## endpoint and keys
|
|
291
|
+
|
|
292
|
+
```bash
|
|
293
|
+
# .env
|
|
294
|
+
|
|
295
|
+
LLM_API_URL=http://localhost:8000/v1/chat/completions
|
|
296
|
+
LLM_API_KEY=2893692
|
|
297
|
+
|
|
298
|
+
# only if judge uses a different API:
|
|
299
|
+
# OPENAI_API_KEY=sk-proj-...
|
|
300
|
+
# ANTHROPIC_API_KEY=sk-ant-...
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
works with any OpenAI-compatible server: oMLX, Ollama, LM Studio, vLLM, SGLang, llama.cpp
|
|
304
|
+
|
|
305
|
+
env vars override `.env` file values, so you can do one-off overrides:
|
|
306
|
+
```bash
|
|
307
|
+
$ LLM_API_URL=http://other-server:8000/v1/chat/completions python eval.py run
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
# the eval set
|
|
311
|
+
|
|
312
|
+
## prompts
|
|
313
|
+
|
|
314
|
+
the repo ships with a sample `eval-set.example.json` to show the format. the idea is that you write your own — prompts that test what _you_ actually care about.
|
|
315
|
+
|
|
316
|
+
good prompts for a personal eval set:
|
|
317
|
+
|
|
318
|
+
| category | example |
|
|
319
|
+
| --- | --- |
|
|
320
|
+
| multimodal | show an image, ask "what is this?" |
|
|
321
|
+
| code reading | paste a function, ask what it does and how to improve it |
|
|
322
|
+
| system design | "design an X that does Y given these constraints" |
|
|
323
|
+
| debugging | describe symptoms, ask for root cause in priority order |
|
|
324
|
+
| domain knowledge | something only someone in your field would know well |
|
|
325
|
+
| math / estimation | a calculation with a verifiable answer |
|
|
326
|
+
| architecture | "compare X vs Y, when would you choose each?" |
|
|
327
|
+
| business logic | a tricky edge case from your actual work |
|
|
328
|
+
| assistant competence | give structured data about multiple entities, test if the model keeps them straight |
|
|
329
|
+
| tool calling | define tools in a system prompt, test if the model emits correct JSON calls |
|
|
330
|
+
|
|
331
|
+
a few principles that make prompts work well for eval:
|
|
332
|
+
|
|
333
|
+
- **self-contained** — no hidden context, no prior conversation
|
|
334
|
+
- **verifiable** — you can tell if the answer is right or wrong
|
|
335
|
+
- **no tools needed** — no web search, no code execution (unless testing tool-call _format_)
|
|
336
|
+
- **stable** — the answer shouldn't change next month
|
|
337
|
+
|
|
338
|
+
each prompt includes a rubric that the judge uses:
|
|
339
|
+
|
|
340
|
+
```json
|
|
341
|
+
{
|
|
342
|
+
"id": 1,
|
|
343
|
+
"category": "math_estimation",
|
|
344
|
+
"title": "Estimate Memory from Quantization",
|
|
345
|
+
"prompt": "A model has 7 billion parameters. Estimate the memory footprint for FP16, 8-bit, and 4-bit.",
|
|
346
|
+
"rubric": {
|
|
347
|
+
"3": "FP16: ~14GB, 8-bit: ~7GB, 4-bit: ~3.5GB. Shows the math.",
|
|
348
|
+
"2": "Correct for 2 of 3, or all correct but no explanation.",
|
|
349
|
+
"1": "Gets the direction right but wrong numbers.",
|
|
350
|
+
"0": "Wrong math or doesn't understand quantization."
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
```
|
|
354
|
+
|
|
355
|
+
## multi-turn prompts
|
|
356
|
+
|
|
357
|
+
not every capability shows up in a single Q&A turn. some things — tool calling, context shifts, follow-up questions — need a conversation.
|
|
358
|
+
|
|
359
|
+
multi-turn prompts use `turns` instead of `prompt`:
|
|
360
|
+
|
|
361
|
+
```json
|
|
362
|
+
{
|
|
363
|
+
"id": 21,
|
|
364
|
+
"category": "assistant_competence",
|
|
365
|
+
"title": "Tool Calling — School Status Check",
|
|
366
|
+
"turns": [
|
|
367
|
+
{
|
|
368
|
+
"messages": [
|
|
369
|
+
{"role": "system", "content": "You are an assistant with these tools: ..."},
|
|
370
|
+
{"role": "user", "content": "How are both kids doing?"}
|
|
371
|
+
]
|
|
372
|
+
},
|
|
373
|
+
{
|
|
374
|
+
"inject_after": [
|
|
375
|
+
{"role": "user", "content": "Tool results:\n\nget_grades(\"phoebe\") => ..."}
|
|
376
|
+
],
|
|
377
|
+
"messages": []
|
|
378
|
+
},
|
|
379
|
+
{
|
|
380
|
+
"messages": [
|
|
381
|
+
{"role": "user", "content": "Now check last week's attendance too."}
|
|
382
|
+
]
|
|
383
|
+
}
|
|
384
|
+
],
|
|
385
|
+
"rubric": {
|
|
386
|
+
"3": "Turn 1 emits correct tool calls as JSON. Turn 2 synthesizes results accurately. ...",
|
|
387
|
+
"2": "...",
|
|
388
|
+
"1": "...",
|
|
389
|
+
"0": "..."
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
each turn has:
|
|
395
|
+
|
|
396
|
+
- **`messages`** — added to history before calling the model. if the last message is from a user, the model responds.
|
|
397
|
+
- **`inject_after`** — added to history _after_ the model responds (or immediately if there are no messages). if the injected messages end with a user message, the model responds again.
|
|
398
|
+
|
|
399
|
+
this lets you simulate tool-call workflows: the model emits tool calls, you inject fake results, the model synthesizes — all scored by the judge at the end.
|
|
400
|
+
|
|
401
|
+
the judge sees the full conversation transcript (all turns, all model responses) when scoring.
|
|
402
|
+
|
|
403
|
+
**when to use multi-turn vs single-turn:**
|
|
404
|
+
|
|
405
|
+
a model that aces single-turn knowledge questions can still fail badly as an assistant. multi-turn prompts catch things like:
|
|
406
|
+
|
|
407
|
+
- can it emit tool calls in the format you asked for, or does it fabricate data?
|
|
408
|
+
- can it stay coherent across turns without mixing up entities?
|
|
409
|
+
- does it follow workflow rules (check health _before_ investigating, don't rollback without approval)?
|
|
410
|
+
|
|
411
|
+
if you're evaluating models for assistant / agent use, you probably want both.
|
|
412
|
+
|
|
413
|
+
## writing your own
|
|
414
|
+
|
|
415
|
+
create an `eval-set.json` following the same structure. copy `eval-set.example.json` as a starting point:
|
|
416
|
+
|
|
417
|
+
```bash
|
|
418
|
+
$ cp eval-set.example.json eval-set.json
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
add, remove, or replace prompts:
|
|
422
|
+
|
|
423
|
+
```json
|
|
424
|
+
{
|
|
425
|
+
"id": 4,
|
|
426
|
+
"category": "debugging",
|
|
427
|
+
"title": "Connection Pool Exhaustion",
|
|
428
|
+
"prompt": "A web service starts returning 503s after 2 hours under steady load. DB connections are not being returned to the pool. What are the most likely causes in priority order?",
|
|
429
|
+
"rubric": {
|
|
430
|
+
"3": "Identifies: unclosed connections in error paths, missing finally/context manager, connection leak on timeout. Correct priority.",
|
|
431
|
+
"2": "Gets top cause right but weak prioritization.",
|
|
432
|
+
"1": "Some valid causes but misses the leak pattern.",
|
|
433
|
+
"0": "Doesn't understand connection pooling."
|
|
434
|
+
}
|
|
435
|
+
}
|
|
436
|
+
```
|
|
437
|
+
|
|
438
|
+
one prompt can use an image (for vision models). set `image_filename` in config.yml and place the image in the repo root or use `--image-dir`.
|
|
439
|
+
|
|
440
|
+
prompts can be single-turn (a `prompt` string) or multi-turn (a `turns` array) — see [multi-turn prompts](#multi-turn-prompts) above.<br/>
|
|
441
|
+
when a model returns native tool calls (OpenAI `tool_calls` format), bench captures them as JSON in the response so the judge can see what the model tried to do.
|
|
442
|
+
|
|
443
|
+
# thinking models
|
|
444
|
+
|
|
445
|
+
Qwen3.5, DeepSeek, and other reasoning models emit `<think>...</think>` blocks before their answer.<br/>
|
|
446
|
+
bench handles this automatically:
|
|
447
|
+
|
|
448
|
+
- the thinking is **separated** from the response and stored in a `thinking` field
|
|
449
|
+
- only the **response after thinking** is sent to the judge
|
|
450
|
+
- if thinking consumes all tokens and no answer is produced, it's flagged as an error
|
|
451
|
+
- `thinking: null` in config means "let models think naturally" — this is the recommended default
|
|
452
|
+
- some servers (oMLX, Ollama) leak think tags into the content field — bench strips them
|
|
453
|
+
|
|
454
|
+
if you want to explicitly disable thinking:
|
|
455
|
+
|
|
456
|
+
```yaml
|
|
457
|
+
thinking: 0
|
|
458
|
+
```
|
|
459
|
+
|
|
460
|
+
or set a budget:
|
|
461
|
+
|
|
462
|
+
```yaml
|
|
463
|
+
thinking: 4096
|
|
464
|
+
```
|
|
465
|
+
|
|
466
|
+
# license
|
|
467
|
+
|
|
468
|
+
Copyright © 2026 tolitius
|
|
469
|
+
|
|
470
|
+
Distributed under the MIT License, consistent with the
|
|
471
|
+
package metadata above.
|