arbiter-cli 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arbiter_cli-0.1.0/LICENSE +21 -0
- arbiter_cli-0.1.0/PKG-INFO +299 -0
- arbiter_cli-0.1.0/README.md +262 -0
- arbiter_cli-0.1.0/arbiter/__init__.py +3 -0
- arbiter_cli-0.1.0/arbiter/cli/__init__.py +0 -0
- arbiter_cli-0.1.0/arbiter/cli/app.py +699 -0
- arbiter_cli-0.1.0/arbiter/cli/display.py +381 -0
- arbiter_cli-0.1.0/arbiter/core/__init__.py +0 -0
- arbiter_cli-0.1.0/arbiter/core/benchmarks.py +804 -0
- arbiter_cli-0.1.0/arbiter/core/config.py +137 -0
- arbiter_cli-0.1.0/arbiter/core/discover.py +184 -0
- arbiter_cli-0.1.0/arbiter/core/judge.py +193 -0
- arbiter_cli-0.1.0/arbiter/core/leaderboard.py +197 -0
- arbiter_cli-0.1.0/arbiter/core/metrics.py +367 -0
- arbiter_cli-0.1.0/arbiter/core/providers/__init__.py +19 -0
- arbiter_cli-0.1.0/arbiter/core/providers/anthropic_provider.py +133 -0
- arbiter_cli-0.1.0/arbiter/core/providers/base.py +62 -0
- arbiter_cli-0.1.0/arbiter/core/providers/factory.py +79 -0
- arbiter_cli-0.1.0/arbiter/core/providers/google_provider.py +126 -0
- arbiter_cli-0.1.0/arbiter/core/providers/ollama.py +103 -0
- arbiter_cli-0.1.0/arbiter/core/providers/openai_provider.py +120 -0
- arbiter_cli-0.1.0/arbiter/core/runner.py +257 -0
- arbiter_cli-0.1.0/arbiter/core/swe/__init__.py +1 -0
- arbiter_cli-0.1.0/arbiter/core/swe/container.py +158 -0
- arbiter_cli-0.1.0/arbiter/core/swe/runner.py +220 -0
- arbiter_cli-0.1.0/arbiter/core/swe/sandbox.py +111 -0
- arbiter_cli-0.1.0/arbiter/core/swe/test_packs.py +548 -0
- arbiter_cli-0.1.0/arbiter/dashboard/__init__.py +0 -0
- arbiter_cli-0.1.0/arbiter/dashboard/frontend/dist/assets/index-1tkxJouQ.css +1 -0
- arbiter_cli-0.1.0/arbiter/dashboard/frontend/dist/assets/index-dHa4zmvw.js +298 -0
- arbiter_cli-0.1.0/arbiter/dashboard/frontend/dist/index.html +16 -0
- arbiter_cli-0.1.0/arbiter/dashboard/server.py +426 -0
- arbiter_cli-0.1.0/arbiter_cli.egg-info/PKG-INFO +299 -0
- arbiter_cli-0.1.0/arbiter_cli.egg-info/SOURCES.txt +40 -0
- arbiter_cli-0.1.0/arbiter_cli.egg-info/dependency_links.txt +1 -0
- arbiter_cli-0.1.0/arbiter_cli.egg-info/entry_points.txt +2 -0
- arbiter_cli-0.1.0/arbiter_cli.egg-info/requires.txt +12 -0
- arbiter_cli-0.1.0/arbiter_cli.egg-info/top_level.txt +1 -0
- arbiter_cli-0.1.0/pyproject.toml +66 -0
- arbiter_cli-0.1.0/setup.cfg +33 -0
- arbiter_cli-0.1.0/setup.py +2 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 BasaltLabs
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: arbiter-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: The final word on your local models. Compare LLMs side-by-side with animated visualizations.
|
|
5
|
+
Author-email: BasaltLabs <hello@basaltlabs.app>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Basaltlabs-app/Arbiter
|
|
8
|
+
Project-URL: Repository, https://github.com/Basaltlabs-app/Arbiter
|
|
9
|
+
Project-URL: Issues, https://github.com/Basaltlabs-app/Arbiter/issues
|
|
10
|
+
Keywords: llm,benchmark,comparison,ollama,local-ai,gemma,qwen,llama
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Software Development :: Testing
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.9
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: typer>=0.9.0
|
|
26
|
+
Requires-Dist: rich>=13.0.0
|
|
27
|
+
Requires-Dist: httpx>=0.25.0
|
|
28
|
+
Requires-Dist: psutil>=5.9.0
|
|
29
|
+
Requires-Dist: fastapi>=0.100.0
|
|
30
|
+
Requires-Dist: uvicorn>=0.23.0
|
|
31
|
+
Requires-Dist: websockets>=12.0
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
35
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
36
|
+
Dynamic: license-file
|
|
37
|
+
|
|
38
|
+
<p align="center">
|
|
39
|
+
<img src="https://img.shields.io/badge/arbiter-v0.1.0-00d4ff?style=for-the-badge" alt="version" />
|
|
40
|
+
</p>
|
|
41
|
+
|
|
42
|
+
<h1 align="center">Arbiter</h1>
|
|
43
|
+
|
|
44
|
+
<p align="center">
|
|
45
|
+
<strong>The final word on your local models.</strong><br>
|
|
46
|
+
Real benchmarks. Real code tests. Beautiful dashboard. Fully local.
|
|
47
|
+
</p>
|
|
48
|
+
|
|
49
|
+
<p align="center">
|
|
50
|
+
<a href="#install">Install</a> •
|
|
51
|
+
<a href="#quick-start">Quick Start</a> •
|
|
52
|
+
<a href="#what-it-tests">What It Tests</a> •
|
|
53
|
+
<a href="#dashboard">Dashboard</a> •
|
|
54
|
+
<a href="#swe-testing">SWE Testing</a> •
|
|
55
|
+
<a href="#how-scoring-works">Scoring</a>
|
|
56
|
+
</p>
|
|
57
|
+
|
|
58
|
+
<p align="center">
|
|
59
|
+
<img src="https://img.shields.io/pypi/v/arbiter-cli?color=00d4ff" alt="PyPI" />
|
|
60
|
+
<img src="https://img.shields.io/github/license/Basaltlabs-app/Arbiter" alt="License" />
|
|
61
|
+
<img src="https://img.shields.io/badge/AI-100%25%20Local-10b981" alt="Local AI" />
|
|
62
|
+
<img src="https://img.shields.io/badge/models-any%20Ollama%20%2B%20cloud-7c3aed" alt="Models" />
|
|
63
|
+
<img src="https://img.shields.io/badge/judge-no%20LLM%20vibes-f59e0b" alt="No vibes" />
|
|
64
|
+
</p>
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
New models drop every week. Gemma 4. Qwen 3.5. Llama 4. But lab benchmarks don't tell you how they run **on your machine** for **your use cases**.
|
|
69
|
+
|
|
70
|
+
**Arbiter** runs real automated tests against any model, verifies results programmatically (not by asking another LLM), and shows you exactly why one model beats another.
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install arbiter-cli
|
|
74
|
+
arbiter benchmark
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
No API keys. No cloud. Everything runs on your hardware.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Install
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
pip install arbiter-cli
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
**Requirements:**
|
|
88
|
+
- Python 3.9+
|
|
89
|
+
- [Ollama](https://ollama.com) with at least one model installed
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
# Install Ollama, then pull a model:
|
|
93
|
+
ollama pull qwen3.5:4b
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Quick Start
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
# Open the dashboard (recommended -- everything in the browser)
|
|
100
|
+
arbiter dashboard
|
|
101
|
+
|
|
102
|
+
# Run the benchmark suite from terminal
|
|
103
|
+
arbiter benchmark
|
|
104
|
+
|
|
105
|
+
# Quick benchmark (8 key tests, faster)
|
|
106
|
+
arbiter benchmark --quick
|
|
107
|
+
|
|
108
|
+
# Compare models with your own prompt
|
|
109
|
+
arbiter run "explain how a hash table works"
|
|
110
|
+
|
|
111
|
+
# List your installed models
|
|
112
|
+
arbiter discover
|
|
113
|
+
|
|
114
|
+
# View persistent rankings
|
|
115
|
+
arbiter leaderboard
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
The dashboard auto-detects your installed models. Pick models, type a prompt or hit "Run Benchmark", see results.
|
|
119
|
+
|
|
120
|
+
## What It Tests
|
|
121
|
+
|
|
122
|
+
**20 automated tests across 8 categories.** Every test has a programmatic pass/fail check. No model judges another model.
|
|
123
|
+
|
|
124
|
+
| Category | What It Tests | How It Verifies |
|
|
125
|
+
|---|---|---|
|
|
126
|
+
| **Instruction Following** | Does it do exactly what you asked? | Line count, format regex, constraint check |
|
|
127
|
+
| **Code Generation** | Can it fix bugs and build real code? | AST parsing, structural analysis |
|
|
128
|
+
| **Factual Accuracy** | Real facts or hallucinations? | Ground truth match against known answers |
|
|
129
|
+
| **Reasoning** | Multi-step logic, math, time | Expected answer comparison |
|
|
130
|
+
| **Consistency** | Same question 3 ways = same answer? | Cross-run similarity scoring |
|
|
131
|
+
| **Pressure Resistance** | Does it cave when you push back? | Sycophancy detection (correct answer flip) |
|
|
132
|
+
| **Speed** | Tokens/sec on YOUR hardware | Direct measurement |
|
|
133
|
+
| **Context Recall** | Find info hidden in longer text | Hidden value retrieval |
|
|
134
|
+
|
|
135
|
+
### Code Generation Tests (SWE-bench inspired)
|
|
136
|
+
|
|
137
|
+
These aren't toy FizzBuzz problems. Arbiter gives models real buggy code and checks if the fix actually works:
|
|
138
|
+
|
|
139
|
+
| Test | What the Model Must Do |
|
|
140
|
+
|---|---|
|
|
141
|
+
| **Binary Search Fix** | Find the off-by-one bug, fix it. 7 pytest assertions. |
|
|
142
|
+
| **Edge Case Handling** | Write safe_divide with zero, type, and negative handling |
|
|
143
|
+
| **Data Structure** | Build a Stack class with proper error handling |
|
|
144
|
+
| **API Design** | Sliding window rate limiter with time tracking |
|
|
145
|
+
| **Code Comprehension** | Predict output of tricky Python (slice references) |
|
|
146
|
+
| **JSON Output** | Generate valid, parseable structured data |
|
|
147
|
+
|
|
148
|
+
### Example Output
|
|
149
|
+
|
|
150
|
+
```
|
|
151
|
+
Overall Scores
|
|
152
|
+
Higher is better. 100 = perfect on every test.
|
|
153
|
+
|
|
154
|
+
gemma4:e2b ████████████████░░░░░░░░░░░░░░░░ 53/100 (5/8 passed)
|
|
155
|
+
qwen3.5:4b ██████░░░░░░░░░░░░░░░░░░░░░░░░░░ 21/100 (2/8 passed)
|
|
156
|
+
|
|
157
|
+
Factual Accuracy Does it give real facts or make things up?
|
|
158
|
+
gemma4:e2b 100% BEST
|
|
159
|
+
qwen3.5:4b 0%
|
|
160
|
+
Known factual answers: PASS FAIL
|
|
161
|
+
|
|
162
|
+
Code Generation Can it write working code?
|
|
163
|
+
gemma4:e2b 0%
|
|
164
|
+
qwen3.5:4b 50% BEST
|
|
165
|
+
Fix binary search bug: FAIL PASS
|
|
166
|
+
|
|
167
|
+
Winner: gemma4:e2b scored 53/100, passing 5 of 8 tests.
|
|
168
|
+
Strong at: factual accuracy, pressure resistance, reasoning.
|
|
169
|
+
Weaker at: code generation (beaten by qwen3.5:4b).
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Every number tells you what was tested and what happened. No black boxes.
|
|
173
|
+
|
|
174
|
+
## Dashboard
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
arbiter dashboard
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
Opens a full GUI in your browser at `http://127.0.0.1:7878`. Everything you can do in the terminal, you can do from the dashboard:
|
|
181
|
+
|
|
182
|
+
- **Select models** from auto-detected installed list
|
|
183
|
+
- **Type prompts** and run comparisons
|
|
184
|
+
- **Run benchmarks** with one click
|
|
185
|
+
- **View results** with animated charts and graphs
|
|
186
|
+
- **Track rankings** across all your comparisons
|
|
187
|
+
|
|
188
|
+
### Tabs
|
|
189
|
+
|
|
190
|
+
| Tab | What It Shows |
|
|
191
|
+
|---|---|
|
|
192
|
+
| **Arena** | Live token race, winner announcement, model output cards |
|
|
193
|
+
| **Benchmark** | Full test suite with category breakdown and pass/fail per test |
|
|
194
|
+
| **Speed** | Animated bar charts for tok/s, TTFT, total generation time |
|
|
195
|
+
| **Quality** | Radar chart comparing quality dimensions |
|
|
196
|
+
| **Graph** | Force-directed node graph showing model relationships |
|
|
197
|
+
| **Rankings** | Persistent Elo leaderboard with sparklines |
|
|
198
|
+
| **Help** | Getting started guide, FAQ, terminal command reference |
|
|
199
|
+
|
|
200
|
+
The dashboard shows your system RAM and warns if a model is too large for your machine.
|
|
201
|
+
|
|
202
|
+
## SWE Testing
|
|
203
|
+
|
|
204
|
+
Real software engineering tests in the style of [SWE-bench](https://www.swebench.com/). Models receive buggy code + a bug report, write a fix, and the fix is verified by running pytest.
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
arbiter swe
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
7 built-in challenges across bug fixes and algorithm implementations. Tests run in a sandboxed subprocess (no Docker required) or optionally in Docker containers with `--docker`.
|
|
211
|
+
|
|
212
|
+
| Challenge | Type | Pytest Assertions |
|
|
213
|
+
|---|---|---|
|
|
214
|
+
| Binary Search Off-by-One | Bug fix | 7 |
|
|
215
|
+
| LRU Cache Eviction | Bug fix | 5 |
|
|
216
|
+
| Flatten Nested List | Bug fix | 6 |
|
|
217
|
+
| Rate Limiter Window | Implementation | 4 |
|
|
218
|
+
| Linked List Cycle (O(1) memory) | Implementation | 6 |
|
|
219
|
+
| Two Sum (O(n) time) | Algorithm | 5 |
|
|
220
|
+
| Merge Intervals | Algorithm | 6 |
|
|
221
|
+
|
|
222
|
+
## How Scoring Works
|
|
223
|
+
|
|
224
|
+
### Benchmark Suite
|
|
225
|
+
|
|
226
|
+
Each test is **pass/fail** with programmatic verification. The overall score is the percentage of tests passed. Category scores are averages within each category. No LLM judges another LLM.
|
|
227
|
+
|
|
228
|
+
### Run Comparison
|
|
229
|
+
|
|
230
|
+
When comparing models with a custom prompt, Arbiter uses a **transparent composite formula**:
|
|
231
|
+
|
|
232
|
+
```
|
|
233
|
+
Score = Speed(30%) + Quality(50%) + Responsiveness(20%)
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
- **Speed:** `model_tps / best_tps` (higher tokens/sec = better)
|
|
237
|
+
- **Responsiveness:** `best_ttft / model_ttft` (lower time-to-first-token = better)
|
|
238
|
+
- **Quality:** `model_score / 10` (from optional LLM judge, or skipped with `--no-judge`)
|
|
239
|
+
|
|
240
|
+
You see every component. You see the raw values. You see exactly why Model A scored higher than Model B.
|
|
241
|
+
|
|
242
|
+
### Elo Leaderboard
|
|
243
|
+
|
|
244
|
+
Every comparison updates a persistent [Elo rating](https://en.wikipedia.org/wiki/Elo_rating_system) stored in `~/.arbiter/leaderboard.json`. Rankings build over time across all your comparisons. Multi-model comparisons scale the K-factor to prevent rating inflation.
|
|
245
|
+
|
|
246
|
+
## Multi-Provider Support
|
|
247
|
+
|
|
248
|
+
Arbiter works with any model you can run:
|
|
249
|
+
|
|
250
|
+
| Provider | Format | Example |
|
|
251
|
+
|---|---|---|
|
|
252
|
+
| **Ollama** (local) | `model_name` | `qwen3.5:4b` |
|
|
253
|
+
| **OpenAI** | `openai:model` | `openai:gpt-4o` |
|
|
254
|
+
| **Anthropic** | `anthropic:model` | `anthropic:claude-sonnet-4-20250514` |
|
|
255
|
+
| **Google** | `google:model` | `google:gemini-2.0-flash` |
|
|
256
|
+
| **Any OpenAI-compatible** | `http://host:port/v1:model` | Custom endpoints |
|
|
257
|
+
|
|
258
|
+
Cloud providers require API keys via environment variables (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`).
|
|
259
|
+
|
|
260
|
+
## Low RAM? No Problem
|
|
261
|
+
|
|
262
|
+
Arbiter was built and tested on an **8GB M1 MacBook Air**. Sequential mode runs one model at a time so each gets your full RAM:
|
|
263
|
+
|
|
264
|
+
```bash
|
|
265
|
+
arbiter benchmark --seq
|
|
266
|
+
arbiter run "your prompt" --seq
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
The dashboard shows system RAM and warns if a model is too large.
|
|
270
|
+
|
|
271
|
+
## Philosophy
|
|
272
|
+
|
|
273
|
+
- **Your hardware, your results.** Lab benchmarks don't tell you how a model runs on your machine.
|
|
274
|
+
- **Your prompts, your answers.** Generic benchmarks don't test what you actually use AI for.
|
|
275
|
+
- **No vibes.** Every test has a programmatic pass/fail. No "this feels like a 7/10."
|
|
276
|
+
- **Fully local.** Your code and prompts never leave your computer.
|
|
277
|
+
- **Transparent.** See every score, every formula, every raw value. No black boxes.
|
|
278
|
+
|
|
279
|
+
## Contributing
|
|
280
|
+
|
|
281
|
+
We welcome contributions! Areas we need help with:
|
|
282
|
+
|
|
283
|
+
- **New test packs** -- submit benchmark tests for specific domains (data science, web dev, etc.)
|
|
284
|
+
- **New providers** -- add support for more LLM APIs
|
|
285
|
+
- **Dashboard improvements** -- better visualizations, mobile responsive
|
|
286
|
+
- **Documentation** -- tutorials, guides, translations
|
|
287
|
+
|
|
288
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for details.
|
|
289
|
+
|
|
290
|
+
## License
|
|
291
|
+
|
|
292
|
+
MIT
|
|
293
|
+
|
|
294
|
+
---
|
|
295
|
+
|
|
296
|
+
<p align="center">
|
|
297
|
+
Built by <a href="https://basaltlabs.app">BasaltLabs</a><br>
|
|
298
|
+
<sub>The final word on your local models.</sub>
|
|
299
|
+
</p>
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://img.shields.io/badge/arbiter-v0.1.0-00d4ff?style=for-the-badge" alt="version" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">Arbiter</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<strong>The final word on your local models.</strong><br>
|
|
9
|
+
Real benchmarks. Real code tests. Beautiful dashboard. Fully local.
|
|
10
|
+
</p>
|
|
11
|
+
|
|
12
|
+
<p align="center">
|
|
13
|
+
<a href="#install">Install</a> •
|
|
14
|
+
<a href="#quick-start">Quick Start</a> •
|
|
15
|
+
<a href="#what-it-tests">What It Tests</a> •
|
|
16
|
+
<a href="#dashboard">Dashboard</a> •
|
|
17
|
+
<a href="#swe-testing">SWE Testing</a> •
|
|
18
|
+
<a href="#how-scoring-works">Scoring</a>
|
|
19
|
+
</p>
|
|
20
|
+
|
|
21
|
+
<p align="center">
|
|
22
|
+
<img src="https://img.shields.io/pypi/v/arbiter-cli?color=00d4ff" alt="PyPI" />
|
|
23
|
+
<img src="https://img.shields.io/github/license/Basaltlabs-app/Arbiter" alt="License" />
|
|
24
|
+
<img src="https://img.shields.io/badge/AI-100%25%20Local-10b981" alt="Local AI" />
|
|
25
|
+
<img src="https://img.shields.io/badge/models-any%20Ollama%20%2B%20cloud-7c3aed" alt="Models" />
|
|
26
|
+
<img src="https://img.shields.io/badge/judge-no%20LLM%20vibes-f59e0b" alt="No vibes" />
|
|
27
|
+
</p>
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
New models drop every week. Gemma 4. Qwen 3.5. Llama 4. But lab benchmarks don't tell you how they run **on your machine** for **your use cases**.
|
|
32
|
+
|
|
33
|
+
**Arbiter** runs real automated tests against any model, verifies results programmatically (not by asking another LLM), and shows you exactly why one model beats another.
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install arbiter-cli
|
|
37
|
+
arbiter benchmark
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
No API keys. No cloud. Everything runs on your hardware.
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## Install
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install arbiter-cli
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
**Requirements:**
|
|
51
|
+
- Python 3.9+
|
|
52
|
+
- [Ollama](https://ollama.com) with at least one model installed
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# Install Ollama, then pull a model:
|
|
56
|
+
ollama pull qwen3.5:4b
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Quick Start
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Open the dashboard (recommended -- everything in the browser)
|
|
63
|
+
arbiter dashboard
|
|
64
|
+
|
|
65
|
+
# Run the benchmark suite from terminal
|
|
66
|
+
arbiter benchmark
|
|
67
|
+
|
|
68
|
+
# Quick benchmark (8 key tests, faster)
|
|
69
|
+
arbiter benchmark --quick
|
|
70
|
+
|
|
71
|
+
# Compare models with your own prompt
|
|
72
|
+
arbiter run "explain how a hash table works"
|
|
73
|
+
|
|
74
|
+
# List your installed models
|
|
75
|
+
arbiter discover
|
|
76
|
+
|
|
77
|
+
# View persistent rankings
|
|
78
|
+
arbiter leaderboard
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
The dashboard auto-detects your installed models. Pick models, type a prompt or hit "Run Benchmark", see results.
|
|
82
|
+
|
|
83
|
+
## What It Tests
|
|
84
|
+
|
|
85
|
+
**20 automated tests across 8 categories.** Every test has a programmatic pass/fail check. No model judges another model.
|
|
86
|
+
|
|
87
|
+
| Category | What It Tests | How It Verifies |
|
|
88
|
+
|---|---|---|
|
|
89
|
+
| **Instruction Following** | Does it do exactly what you asked? | Line count, format regex, constraint check |
|
|
90
|
+
| **Code Generation** | Can it fix bugs and build real code? | AST parsing, structural analysis |
|
|
91
|
+
| **Factual Accuracy** | Real facts or hallucinations? | Ground truth match against known answers |
|
|
92
|
+
| **Reasoning** | Multi-step logic, math, time | Expected answer comparison |
|
|
93
|
+
| **Consistency** | Same question 3 ways = same answer? | Cross-run similarity scoring |
|
|
94
|
+
| **Pressure Resistance** | Does it cave when you push back? | Sycophancy detection (correct answer flip) |
|
|
95
|
+
| **Speed** | Tokens/sec on YOUR hardware | Direct measurement |
|
|
96
|
+
| **Context Recall** | Find info hidden in longer text | Hidden value retrieval |
|
|
97
|
+
|
|
98
|
+
### Code Generation Tests (SWE-bench inspired)
|
|
99
|
+
|
|
100
|
+
These aren't toy FizzBuzz problems. Arbiter gives models real buggy code and checks if the fix actually works:
|
|
101
|
+
|
|
102
|
+
| Test | What the Model Must Do |
|
|
103
|
+
|---|---|
|
|
104
|
+
| **Binary Search Fix** | Find the off-by-one bug, fix it. 7 pytest assertions. |
|
|
105
|
+
| **Edge Case Handling** | Write safe_divide with zero, type, and negative handling |
|
|
106
|
+
| **Data Structure** | Build a Stack class with proper error handling |
|
|
107
|
+
| **API Design** | Sliding window rate limiter with time tracking |
|
|
108
|
+
| **Code Comprehension** | Predict output of tricky Python (slice references) |
|
|
109
|
+
| **JSON Output** | Generate valid, parseable structured data |
|
|
110
|
+
|
|
111
|
+
### Example Output
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
Overall Scores
|
|
115
|
+
Higher is better. 100 = perfect on every test.
|
|
116
|
+
|
|
117
|
+
gemma4:e2b ████████████████░░░░░░░░░░░░░░░░ 53/100 (5/8 passed)
|
|
118
|
+
qwen3.5:4b ██████░░░░░░░░░░░░░░░░░░░░░░░░░░ 21/100 (2/8 passed)
|
|
119
|
+
|
|
120
|
+
Factual Accuracy Does it give real facts or make things up?
|
|
121
|
+
gemma4:e2b 100% BEST
|
|
122
|
+
qwen3.5:4b 0%
|
|
123
|
+
Known factual answers: PASS FAIL
|
|
124
|
+
|
|
125
|
+
Code Generation Can it write working code?
|
|
126
|
+
gemma4:e2b 0%
|
|
127
|
+
qwen3.5:4b 50% BEST
|
|
128
|
+
Fix binary search bug: FAIL PASS
|
|
129
|
+
|
|
130
|
+
Winner: gemma4:e2b scored 53/100, passing 5 of 8 tests.
|
|
131
|
+
Strong at: factual accuracy, pressure resistance, reasoning.
|
|
132
|
+
Weaker at: code generation (beaten by qwen3.5:4b).
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
Every number tells you what was tested and what happened. No black boxes.
|
|
136
|
+
|
|
137
|
+
## Dashboard
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
arbiter dashboard
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Opens a full GUI in your browser at `http://127.0.0.1:7878`. Everything you can do in the terminal, you can do from the dashboard:
|
|
144
|
+
|
|
145
|
+
- **Select models** from auto-detected installed list
|
|
146
|
+
- **Type prompts** and run comparisons
|
|
147
|
+
- **Run benchmarks** with one click
|
|
148
|
+
- **View results** with animated charts and graphs
|
|
149
|
+
- **Track rankings** across all your comparisons
|
|
150
|
+
|
|
151
|
+
### Tabs
|
|
152
|
+
|
|
153
|
+
| Tab | What It Shows |
|
|
154
|
+
|---|---|
|
|
155
|
+
| **Arena** | Live token race, winner announcement, model output cards |
|
|
156
|
+
| **Benchmark** | Full test suite with category breakdown and pass/fail per test |
|
|
157
|
+
| **Speed** | Animated bar charts for tok/s, TTFT, total generation time |
|
|
158
|
+
| **Quality** | Radar chart comparing quality dimensions |
|
|
159
|
+
| **Graph** | Force-directed node graph showing model relationships |
|
|
160
|
+
| **Rankings** | Persistent Elo leaderboard with sparklines |
|
|
161
|
+
| **Help** | Getting started guide, FAQ, terminal command reference |
|
|
162
|
+
|
|
163
|
+
The dashboard shows your system RAM and warns if a model is too large for your machine.
|
|
164
|
+
|
|
165
|
+
## SWE Testing
|
|
166
|
+
|
|
167
|
+
Real software engineering tests in the style of [SWE-bench](https://www.swebench.com/). Models receive buggy code + a bug report, write a fix, and the fix is verified by running pytest.
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
arbiter swe
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
7 built-in challenges across bug fixes and algorithm implementations. Tests run in a sandboxed subprocess (no Docker required) or optionally in Docker containers with `--docker`.
|
|
174
|
+
|
|
175
|
+
| Challenge | Type | Pytest Assertions |
|
|
176
|
+
|---|---|---|
|
|
177
|
+
| Binary Search Off-by-One | Bug fix | 7 |
|
|
178
|
+
| LRU Cache Eviction | Bug fix | 5 |
|
|
179
|
+
| Flatten Nested List | Bug fix | 6 |
|
|
180
|
+
| Rate Limiter Window | Implementation | 4 |
|
|
181
|
+
| Linked List Cycle (O(1) memory) | Implementation | 6 |
|
|
182
|
+
| Two Sum (O(n) time) | Algorithm | 5 |
|
|
183
|
+
| Merge Intervals | Algorithm | 6 |
|
|
184
|
+
|
|
185
|
+
## How Scoring Works
|
|
186
|
+
|
|
187
|
+
### Benchmark Suite
|
|
188
|
+
|
|
189
|
+
Each test is **pass/fail** with programmatic verification. The overall score is the percentage of tests passed. Category scores are averages within each category. No LLM judges another LLM.
|
|
190
|
+
|
|
191
|
+
### Run Comparison
|
|
192
|
+
|
|
193
|
+
When comparing models with a custom prompt, Arbiter uses a **transparent composite formula**:
|
|
194
|
+
|
|
195
|
+
```
|
|
196
|
+
Score = Speed(30%) + Quality(50%) + Responsiveness(20%)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
- **Speed:** `model_tps / best_tps` (higher tokens/sec = better)
|
|
200
|
+
- **Responsiveness:** `best_ttft / model_ttft` (lower time-to-first-token = better)
|
|
201
|
+
- **Quality:** `model_score / 10` (from optional LLM judge, or skipped with `--no-judge`)
|
|
202
|
+
|
|
203
|
+
You see every component. You see the raw values. You see exactly why Model A scored higher than Model B.
|
|
204
|
+
|
|
205
|
+
### Elo Leaderboard
|
|
206
|
+
|
|
207
|
+
Every comparison updates a persistent [Elo rating](https://en.wikipedia.org/wiki/Elo_rating_system) stored in `~/.arbiter/leaderboard.json`. Rankings build over time across all your comparisons. Multi-model comparisons scale the K-factor to prevent rating inflation.
|
|
208
|
+
|
|
209
|
+
## Multi-Provider Support
|
|
210
|
+
|
|
211
|
+
Arbiter works with any model you can run:
|
|
212
|
+
|
|
213
|
+
| Provider | Format | Example |
|
|
214
|
+
|---|---|---|
|
|
215
|
+
| **Ollama** (local) | `model_name` | `qwen3.5:4b` |
|
|
216
|
+
| **OpenAI** | `openai:model` | `openai:gpt-4o` |
|
|
217
|
+
| **Anthropic** | `anthropic:model` | `anthropic:claude-sonnet-4-20250514` |
|
|
218
|
+
| **Google** | `google:model` | `google:gemini-2.0-flash` |
|
|
219
|
+
| **Any OpenAI-compatible** | `http://host:port/v1:model` | Custom endpoints |
|
|
220
|
+
|
|
221
|
+
Cloud providers require API keys via environment variables (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`).
|
|
222
|
+
|
|
223
|
+
## Low RAM? No Problem
|
|
224
|
+
|
|
225
|
+
Arbiter was built and tested on an **8GB M1 MacBook Air**. Sequential mode runs one model at a time so each gets your full RAM:
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
arbiter benchmark --seq
|
|
229
|
+
arbiter run "your prompt" --seq
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
The dashboard shows system RAM and warns if a model is too large.
|
|
233
|
+
|
|
234
|
+
## Philosophy
|
|
235
|
+
|
|
236
|
+
- **Your hardware, your results.** Lab benchmarks don't tell you how a model runs on your machine.
|
|
237
|
+
- **Your prompts, your answers.** Generic benchmarks don't test what you actually use AI for.
|
|
238
|
+
- **No vibes.** Every test has a programmatic pass/fail. No "this feels like a 7/10."
|
|
239
|
+
- **Fully local.** Your code and prompts never leave your computer.
|
|
240
|
+
- **Transparent.** See every score, every formula, every raw value. No black boxes.
|
|
241
|
+
|
|
242
|
+
## Contributing
|
|
243
|
+
|
|
244
|
+
We welcome contributions! Areas we need help with:
|
|
245
|
+
|
|
246
|
+
- **New test packs** -- submit benchmark tests for specific domains (data science, web dev, etc.)
|
|
247
|
+
- **New providers** -- add support for more LLM APIs
|
|
248
|
+
- **Dashboard improvements** -- better visualizations, mobile responsive
|
|
249
|
+
- **Documentation** -- tutorials, guides, translations
|
|
250
|
+
|
|
251
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for details.
|
|
252
|
+
|
|
253
|
+
## License
|
|
254
|
+
|
|
255
|
+
MIT
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
<p align="center">
|
|
260
|
+
Built by <a href="https://basaltlabs.app">BasaltLabs</a><br>
|
|
261
|
+
<sub>The final word on your local models.</sub>
|
|
262
|
+
</p>
|
|
File without changes
|