arbiter-cli 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. arbiter_cli-0.1.0/LICENSE +21 -0
  2. arbiter_cli-0.1.0/PKG-INFO +299 -0
  3. arbiter_cli-0.1.0/README.md +262 -0
  4. arbiter_cli-0.1.0/arbiter/__init__.py +3 -0
  5. arbiter_cli-0.1.0/arbiter/cli/__init__.py +0 -0
  6. arbiter_cli-0.1.0/arbiter/cli/app.py +699 -0
  7. arbiter_cli-0.1.0/arbiter/cli/display.py +381 -0
  8. arbiter_cli-0.1.0/arbiter/core/__init__.py +0 -0
  9. arbiter_cli-0.1.0/arbiter/core/benchmarks.py +804 -0
  10. arbiter_cli-0.1.0/arbiter/core/config.py +137 -0
  11. arbiter_cli-0.1.0/arbiter/core/discover.py +184 -0
  12. arbiter_cli-0.1.0/arbiter/core/judge.py +193 -0
  13. arbiter_cli-0.1.0/arbiter/core/leaderboard.py +197 -0
  14. arbiter_cli-0.1.0/arbiter/core/metrics.py +367 -0
  15. arbiter_cli-0.1.0/arbiter/core/providers/__init__.py +19 -0
  16. arbiter_cli-0.1.0/arbiter/core/providers/anthropic_provider.py +133 -0
  17. arbiter_cli-0.1.0/arbiter/core/providers/base.py +62 -0
  18. arbiter_cli-0.1.0/arbiter/core/providers/factory.py +79 -0
  19. arbiter_cli-0.1.0/arbiter/core/providers/google_provider.py +126 -0
  20. arbiter_cli-0.1.0/arbiter/core/providers/ollama.py +103 -0
  21. arbiter_cli-0.1.0/arbiter/core/providers/openai_provider.py +120 -0
  22. arbiter_cli-0.1.0/arbiter/core/runner.py +257 -0
  23. arbiter_cli-0.1.0/arbiter/core/swe/__init__.py +1 -0
  24. arbiter_cli-0.1.0/arbiter/core/swe/container.py +158 -0
  25. arbiter_cli-0.1.0/arbiter/core/swe/runner.py +220 -0
  26. arbiter_cli-0.1.0/arbiter/core/swe/sandbox.py +111 -0
  27. arbiter_cli-0.1.0/arbiter/core/swe/test_packs.py +548 -0
  28. arbiter_cli-0.1.0/arbiter/dashboard/__init__.py +0 -0
  29. arbiter_cli-0.1.0/arbiter/dashboard/frontend/dist/assets/index-1tkxJouQ.css +1 -0
  30. arbiter_cli-0.1.0/arbiter/dashboard/frontend/dist/assets/index-dHa4zmvw.js +298 -0
  31. arbiter_cli-0.1.0/arbiter/dashboard/frontend/dist/index.html +16 -0
  32. arbiter_cli-0.1.0/arbiter/dashboard/server.py +426 -0
  33. arbiter_cli-0.1.0/arbiter_cli.egg-info/PKG-INFO +299 -0
  34. arbiter_cli-0.1.0/arbiter_cli.egg-info/SOURCES.txt +40 -0
  35. arbiter_cli-0.1.0/arbiter_cli.egg-info/dependency_links.txt +1 -0
  36. arbiter_cli-0.1.0/arbiter_cli.egg-info/entry_points.txt +2 -0
  37. arbiter_cli-0.1.0/arbiter_cli.egg-info/requires.txt +12 -0
  38. arbiter_cli-0.1.0/arbiter_cli.egg-info/top_level.txt +1 -0
  39. arbiter_cli-0.1.0/pyproject.toml +66 -0
  40. arbiter_cli-0.1.0/setup.cfg +33 -0
  41. arbiter_cli-0.1.0/setup.py +2 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 BasaltLabs
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,299 @@
1
+ Metadata-Version: 2.4
2
+ Name: arbiter-cli
3
+ Version: 0.1.0
4
+ Summary: The final word on your local models. Compare LLMs side-by-side with animated visualizations.
5
+ Author-email: BasaltLabs <hello@basaltlabs.app>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Basaltlabs-app/Arbiter
8
+ Project-URL: Repository, https://github.com/Basaltlabs-app/Arbiter
9
+ Project-URL: Issues, https://github.com/Basaltlabs-app/Arbiter/issues
10
+ Keywords: llm,benchmark,comparison,ollama,local-ai,gemma,qwen,llama
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Software Development :: Testing
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.9
23
+ Description-Content-Type: text/markdown
24
+ License-File: LICENSE
25
+ Requires-Dist: typer>=0.9.0
26
+ Requires-Dist: rich>=13.0.0
27
+ Requires-Dist: httpx>=0.25.0
28
+ Requires-Dist: psutil>=5.9.0
29
+ Requires-Dist: fastapi>=0.100.0
30
+ Requires-Dist: uvicorn>=0.23.0
31
+ Requires-Dist: websockets>=12.0
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7.0; extra == "dev"
34
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
35
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
36
+ Dynamic: license-file
37
+
38
+ <p align="center">
39
+ <img src="https://img.shields.io/badge/arbiter-v0.1.0-00d4ff?style=for-the-badge" alt="version" />
40
+ </p>
41
+
42
+ <h1 align="center">Arbiter</h1>
43
+
44
+ <p align="center">
45
+ <strong>The final word on your local models.</strong><br>
46
+ Real benchmarks. Real code tests. Beautiful dashboard. Fully local.
47
+ </p>
48
+
49
+ <p align="center">
50
+ <a href="#install">Install</a> &bull;
51
+ <a href="#quick-start">Quick Start</a> &bull;
52
+ <a href="#what-it-tests">What It Tests</a> &bull;
53
+ <a href="#dashboard">Dashboard</a> &bull;
54
+ <a href="#swe-testing">SWE Testing</a> &bull;
55
+ <a href="#how-scoring-works">Scoring</a>
56
+ </p>
57
+
58
+ <p align="center">
59
+ <img src="https://img.shields.io/pypi/v/arbiter-cli?color=00d4ff" alt="PyPI" />
60
+ <img src="https://img.shields.io/github/license/Basaltlabs-app/Arbiter" alt="License" />
61
+ <img src="https://img.shields.io/badge/AI-100%25%20Local-10b981" alt="Local AI" />
62
+ <img src="https://img.shields.io/badge/models-any%20Ollama%20%2B%20cloud-7c3aed" alt="Models" />
63
+ <img src="https://img.shields.io/badge/judge-no%20LLM%20vibes-f59e0b" alt="No vibes" />
64
+ </p>
65
+
66
+ ---
67
+
68
+ New models drop every week. Gemma 4. Qwen 3.5. Llama 4. But lab benchmarks don't tell you how they run **on your machine** for **your use cases**.
69
+
70
+ **Arbiter** runs real automated tests against any model, verifies results programmatically (not by asking another LLM), and shows you exactly why one model beats another.
71
+
72
+ ```bash
73
+ pip install arbiter-cli
74
+ arbiter benchmark
75
+ ```
76
+
77
+ No API keys. No cloud. Everything runs on your hardware.
78
+
79
+ ---
80
+
81
+ ## Install
82
+
83
+ ```bash
84
+ pip install arbiter-cli
85
+ ```
86
+
87
+ **Requirements:**
88
+ - Python 3.9+
89
+ - [Ollama](https://ollama.com) with at least one model installed
90
+
91
+ ```bash
92
+ # Install Ollama, then pull a model:
93
+ ollama pull qwen3.5:4b
94
+ ```
95
+
96
+ ## Quick Start
97
+
98
+ ```bash
99
+ # Open the dashboard (recommended -- everything in the browser)
100
+ arbiter dashboard
101
+
102
+ # Run the benchmark suite from terminal
103
+ arbiter benchmark
104
+
105
+ # Quick benchmark (8 key tests, faster)
106
+ arbiter benchmark --quick
107
+
108
+ # Compare models with your own prompt
109
+ arbiter run "explain how a hash table works"
110
+
111
+ # List your installed models
112
+ arbiter discover
113
+
114
+ # View persistent rankings
115
+ arbiter leaderboard
116
+ ```
117
+
118
+ The dashboard auto-detects your installed models. Pick models, type a prompt or hit "Run Benchmark", see results.
119
+
120
+ ## What It Tests
121
+
122
+ **20 automated tests across 8 categories.** Every test has a programmatic pass/fail check. No model judges another model.
123
+
124
+ | Category | What It Tests | How It Verifies |
125
+ |---|---|---|
126
+ | **Instruction Following** | Does it do exactly what you asked? | Line count, format regex, constraint check |
127
+ | **Code Generation** | Can it fix bugs and build real code? | AST parsing, structural analysis |
128
+ | **Factual Accuracy** | Real facts or hallucinations? | Ground truth match against known answers |
129
+ | **Reasoning** | Multi-step logic, math, time | Expected answer comparison |
130
+ | **Consistency** | Same question 3 ways = same answer? | Cross-run similarity scoring |
131
+ | **Pressure Resistance** | Does it cave when you push back? | Sycophancy detection (correct answer flip) |
132
+ | **Speed** | Tokens/sec on YOUR hardware | Direct measurement |
133
+ | **Context Recall** | Find info hidden in longer text | Hidden value retrieval |
134
+
135
+ ### Code Generation Tests (SWE-bench inspired)
136
+
137
+ These aren't toy FizzBuzz problems. Arbiter gives models real buggy code and checks if the fix actually works:
138
+
139
+ | Test | What the Model Must Do |
140
+ |---|---|
141
+ | **Binary Search Fix** | Find the off-by-one bug, fix it. 7 pytest assertions. |
142
+ | **Edge Case Handling** | Write safe_divide with zero, type, and negative handling |
143
+ | **Data Structure** | Build a Stack class with proper error handling |
144
+ | **API Design** | Sliding window rate limiter with time tracking |
145
+ | **Code Comprehension** | Predict output of tricky Python (slice references) |
146
+ | **JSON Output** | Generate valid, parseable structured data |
147
+
148
+ ### Example Output
149
+
150
+ ```
151
+ Overall Scores
152
+ Higher is better. 100 = perfect on every test.
153
+
154
+ gemma4:e2b ████████████████░░░░░░░░░░░░░░░░ 53/100 (5/8 passed)
155
+ qwen3.5:4b ██████░░░░░░░░░░░░░░░░░░░░░░░░░░ 21/100 (2/8 passed)
156
+
157
+ Factual Accuracy Does it give real facts or make things up?
158
+ gemma4:e2b 100% BEST
159
+ qwen3.5:4b 0%
160
+ Known factual answers: PASS FAIL
161
+
162
+ Code Generation Can it write working code?
163
+ gemma4:e2b 0%
164
+ qwen3.5:4b 50% BEST
165
+ Fix binary search bug: FAIL PASS
166
+
167
+ Winner: gemma4:e2b scored 53/100, passing 5 of 8 tests.
168
+ Strong at: factual accuracy, pressure resistance, reasoning.
169
+ Weaker at: code generation (beaten by qwen3.5:4b).
170
+ ```
171
+
172
+ Every number tells you what was tested and what happened. No black boxes.
173
+
174
+ ## Dashboard
175
+
176
+ ```bash
177
+ arbiter dashboard
178
+ ```
179
+
180
+ Opens a full GUI in your browser at `http://127.0.0.1:7878`. Everything you can do in the terminal, you can do from the dashboard:
181
+
182
+ - **Select models** from auto-detected installed list
183
+ - **Type prompts** and run comparisons
184
+ - **Run benchmarks** with one click
185
+ - **View results** with animated charts and graphs
186
+ - **Track rankings** across all your comparisons
187
+
188
+ ### Tabs
189
+
190
+ | Tab | What It Shows |
191
+ |---|---|
192
+ | **Arena** | Live token race, winner announcement, model output cards |
193
+ | **Benchmark** | Full test suite with category breakdown and pass/fail per test |
194
+ | **Speed** | Animated bar charts for tok/s, TTFT, total generation time |
195
+ | **Quality** | Radar chart comparing quality dimensions |
196
+ | **Graph** | Force-directed node graph showing model relationships |
197
+ | **Rankings** | Persistent Elo leaderboard with sparklines |
198
+ | **Help** | Getting started guide, FAQ, terminal command reference |
199
+
200
+ The dashboard shows your system RAM and warns if a model is too large for your machine.
201
+
202
+ ## SWE Testing
203
+
204
+ Real software engineering tests in the style of [SWE-bench](https://www.swebench.com/). Models receive buggy code + a bug report, write a fix, and the fix is verified by running pytest.
205
+
206
+ ```bash
207
+ arbiter swe
208
+ ```
209
+
210
+ 7 built-in challenges across bug fixes and algorithm implementations. Tests run in a sandboxed subprocess (no Docker required) or optionally in Docker containers with `--docker`.
211
+
212
+ | Challenge | Type | Pytest Assertions |
213
+ |---|---|---|
214
+ | Binary Search Off-by-One | Bug fix | 7 |
215
+ | LRU Cache Eviction | Bug fix | 5 |
216
+ | Flatten Nested List | Bug fix | 6 |
217
+ | Rate Limiter Window | Implementation | 4 |
218
+ | Linked List Cycle (O(1) memory) | Implementation | 6 |
219
+ | Two Sum (O(n) time) | Algorithm | 5 |
220
+ | Merge Intervals | Algorithm | 6 |
221
+
222
+ ## How Scoring Works
223
+
224
+ ### Benchmark Suite
225
+
226
+ Each test is **pass/fail** with programmatic verification. The overall score is the percentage of tests passed. Category scores are averages within each category. No LLM judges another LLM.
227
+
228
+ ### Run Comparison
229
+
230
+ When comparing models with a custom prompt, Arbiter uses a **transparent composite formula**:
231
+
232
+ ```
233
+ Score = Speed(30%) + Quality(50%) + Responsiveness(20%)
234
+ ```
235
+
236
+ - **Speed:** `model_tps / best_tps` (higher tokens/sec = better)
237
+ - **Responsiveness:** `best_ttft / model_ttft` (lower time-to-first-token = better)
238
+ - **Quality:** `model_score / 10` (from optional LLM judge, or skipped with `--no-judge`)
239
+
240
+ You see every component. You see the raw values. You see exactly why Model A scored higher than Model B.
241
+
242
+ ### Elo Leaderboard
243
+
244
+ Every comparison updates a persistent [Elo rating](https://en.wikipedia.org/wiki/Elo_rating_system) stored in `~/.arbiter/leaderboard.json`. Rankings build over time across all your comparisons. Multi-model comparisons scale the K-factor to prevent rating inflation.
245
+
246
+ ## Multi-Provider Support
247
+
248
+ Arbiter works with any model you can run:
249
+
250
+ | Provider | Format | Example |
251
+ |---|---|---|
252
+ | **Ollama** (local) | `model_name` | `qwen3.5:4b` |
253
+ | **OpenAI** | `openai:model` | `openai:gpt-4o` |
254
+ | **Anthropic** | `anthropic:model` | `anthropic:claude-sonnet-4-20250514` |
255
+ | **Google** | `google:model` | `google:gemini-2.0-flash` |
256
+ | **Any OpenAI-compatible** | `http://host:port/v1:model` | Custom endpoints |
257
+
258
+ Cloud providers require API keys via environment variables (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`).
259
+
260
+ ## Low RAM? No Problem
261
+
262
+ Arbiter was built and tested on an **8GB M1 MacBook Air**. Sequential mode runs one model at a time so each gets your full RAM:
263
+
264
+ ```bash
265
+ arbiter benchmark --seq
266
+ arbiter run "your prompt" --seq
267
+ ```
268
+
269
+ The dashboard shows system RAM and warns if a model is too large.
270
+
271
+ ## Philosophy
272
+
273
+ - **Your hardware, your results.** Lab benchmarks don't tell you how a model runs on your machine.
274
+ - **Your prompts, your answers.** Generic benchmarks don't test what you actually use AI for.
275
+ - **No vibes.** Every test has a programmatic pass/fail. No "this feels like a 7/10."
276
+ - **Fully local.** Your code and prompts never leave your computer.
277
+ - **Transparent.** See every score, every formula, every raw value. No black boxes.
278
+
279
+ ## Contributing
280
+
281
+ We welcome contributions! Areas we need help with:
282
+
283
+ - **New test packs** -- submit benchmark tests for specific domains (data science, web dev, etc.)
284
+ - **New providers** -- add support for more LLM APIs
285
+ - **Dashboard improvements** -- better visualizations, mobile responsiveness
286
+ - **Documentation** -- tutorials, guides, translations
287
+
288
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for details.
289
+
290
+ ## License
291
+
292
+ MIT
293
+
294
+ ---
295
+
296
+ <p align="center">
297
+ Built by <a href="https://basaltlabs.app">BasaltLabs</a><br>
298
+ <sub>The final word on your local models.</sub>
299
+ </p>
@@ -0,0 +1,262 @@
1
+ <p align="center">
2
+ <img src="https://img.shields.io/badge/arbiter-v0.1.0-00d4ff?style=for-the-badge" alt="version" />
3
+ </p>
4
+
5
+ <h1 align="center">Arbiter</h1>
6
+
7
+ <p align="center">
8
+ <strong>The final word on your local models.</strong><br>
9
+ Real benchmarks. Real code tests. Beautiful dashboard. Fully local.
10
+ </p>
11
+
12
+ <p align="center">
13
+ <a href="#install">Install</a> &bull;
14
+ <a href="#quick-start">Quick Start</a> &bull;
15
+ <a href="#what-it-tests">What It Tests</a> &bull;
16
+ <a href="#dashboard">Dashboard</a> &bull;
17
+ <a href="#swe-testing">SWE Testing</a> &bull;
18
+ <a href="#how-scoring-works">Scoring</a>
19
+ </p>
20
+
21
+ <p align="center">
22
+ <img src="https://img.shields.io/pypi/v/arbiter-cli?color=00d4ff" alt="PyPI" />
23
+ <img src="https://img.shields.io/github/license/Basaltlabs-app/Arbiter" alt="License" />
24
+ <img src="https://img.shields.io/badge/AI-100%25%20Local-10b981" alt="Local AI" />
25
+ <img src="https://img.shields.io/badge/models-any%20Ollama%20%2B%20cloud-7c3aed" alt="Models" />
26
+ <img src="https://img.shields.io/badge/judge-no%20LLM%20vibes-f59e0b" alt="No vibes" />
27
+ </p>
28
+
29
+ ---
30
+
31
+ New models drop every week. Gemma 4. Qwen 3.5. Llama 4. But lab benchmarks don't tell you how they run **on your machine** for **your use cases**.
32
+
33
+ **Arbiter** runs real automated tests against any model, verifies results programmatically (not by asking another LLM), and shows you exactly why one model beats another.
34
+
35
+ ```bash
36
+ pip install arbiter-cli
37
+ arbiter benchmark
38
+ ```
39
+
40
+ No API keys. No cloud. Everything runs on your hardware.
41
+
42
+ ---
43
+
44
+ ## Install
45
+
46
+ ```bash
47
+ pip install arbiter-cli
48
+ ```
49
+
50
+ **Requirements:**
51
+ - Python 3.9+
52
+ - [Ollama](https://ollama.com) with at least one model installed
53
+
54
+ ```bash
55
+ # Install Ollama, then pull a model:
56
+ ollama pull qwen3.5:4b
57
+ ```
58
+
59
+ ## Quick Start
60
+
61
+ ```bash
62
+ # Open the dashboard (recommended -- everything in the browser)
63
+ arbiter dashboard
64
+
65
+ # Run the benchmark suite from terminal
66
+ arbiter benchmark
67
+
68
+ # Quick benchmark (8 key tests, faster)
69
+ arbiter benchmark --quick
70
+
71
+ # Compare models with your own prompt
72
+ arbiter run "explain how a hash table works"
73
+
74
+ # List your installed models
75
+ arbiter discover
76
+
77
+ # View persistent rankings
78
+ arbiter leaderboard
79
+ ```
80
+
81
+ The dashboard auto-detects your installed models. Pick models, type a prompt or hit "Run Benchmark", see results.
82
+
83
+ ## What It Tests
84
+
85
+ **20 automated tests across 8 categories.** Every test has a programmatic pass/fail check. No model judges another model.
86
+
87
+ | Category | What It Tests | How It Verifies |
88
+ |---|---|---|
89
+ | **Instruction Following** | Does it do exactly what you asked? | Line count, format regex, constraint check |
90
+ | **Code Generation** | Can it fix bugs and build real code? | AST parsing, structural analysis |
91
+ | **Factual Accuracy** | Real facts or hallucinations? | Ground truth match against known answers |
92
+ | **Reasoning** | Multi-step logic, math, time | Expected answer comparison |
93
+ | **Consistency** | Same question 3 ways = same answer? | Cross-run similarity scoring |
94
+ | **Pressure Resistance** | Does it cave when you push back? | Sycophancy detection (correct answer flip) |
95
+ | **Speed** | Tokens/sec on YOUR hardware | Direct measurement |
96
+ | **Context Recall** | Find info hidden in longer text | Hidden value retrieval |
97
+
98
+ ### Code Generation Tests (SWE-bench inspired)
99
+
100
+ These aren't toy FizzBuzz problems. Arbiter gives models real buggy code and checks if the fix actually works:
101
+
102
+ | Test | What the Model Must Do |
103
+ |---|---|
104
+ | **Binary Search Fix** | Find the off-by-one bug, fix it. 7 pytest assertions. |
105
+ | **Edge Case Handling** | Write safe_divide with zero, type, and negative handling |
106
+ | **Data Structure** | Build a Stack class with proper error handling |
107
+ | **API Design** | Sliding window rate limiter with time tracking |
108
+ | **Code Comprehension** | Predict output of tricky Python (slice references) |
109
+ | **JSON Output** | Generate valid, parseable structured data |
110
+
111
+ ### Example Output
112
+
113
+ ```
114
+ Overall Scores
115
+ Higher is better. 100 = perfect on every test.
116
+
117
+ gemma4:e2b ████████████████░░░░░░░░░░░░░░░░ 53/100 (5/8 passed)
118
+ qwen3.5:4b ██████░░░░░░░░░░░░░░░░░░░░░░░░░░ 21/100 (2/8 passed)
119
+
120
+ Factual Accuracy Does it give real facts or make things up?
121
+ gemma4:e2b 100% BEST
122
+ qwen3.5:4b 0%
123
+ Known factual answers: PASS FAIL
124
+
125
+ Code Generation Can it write working code?
126
+ gemma4:e2b 0%
127
+ qwen3.5:4b 50% BEST
128
+ Fix binary search bug: FAIL PASS
129
+
130
+ Winner: gemma4:e2b scored 53/100, passing 5 of 8 tests.
131
+ Strong at: factual accuracy, pressure resistance, reasoning.
132
+ Weaker at: code generation (beaten by qwen3.5:4b).
133
+ ```
134
+
135
+ Every number tells you what was tested and what happened. No black boxes.
136
+
137
+ ## Dashboard
138
+
139
+ ```bash
140
+ arbiter dashboard
141
+ ```
142
+
143
+ Opens a full GUI in your browser at `http://127.0.0.1:7878`. Everything you can do in the terminal, you can do from the dashboard:
144
+
145
+ - **Select models** from auto-detected installed list
146
+ - **Type prompts** and run comparisons
147
+ - **Run benchmarks** with one click
148
+ - **View results** with animated charts and graphs
149
+ - **Track rankings** across all your comparisons
150
+
151
+ ### Tabs
152
+
153
+ | Tab | What It Shows |
154
+ |---|---|
155
+ | **Arena** | Live token race, winner announcement, model output cards |
156
+ | **Benchmark** | Full test suite with category breakdown and pass/fail per test |
157
+ | **Speed** | Animated bar charts for tok/s, TTFT, total generation time |
158
+ | **Quality** | Radar chart comparing quality dimensions |
159
+ | **Graph** | Force-directed node graph showing model relationships |
160
+ | **Rankings** | Persistent Elo leaderboard with sparklines |
161
+ | **Help** | Getting started guide, FAQ, terminal command reference |
162
+
163
+ The dashboard shows your system RAM and warns if a model is too large for your machine.
164
+
165
+ ## SWE Testing
166
+
167
+ Real software engineering tests in the style of [SWE-bench](https://www.swebench.com/). Models receive buggy code + a bug report, write a fix, and the fix is verified by running pytest.
168
+
169
+ ```bash
170
+ arbiter swe
171
+ ```
172
+
173
+ 7 built-in challenges across bug fixes and algorithm implementations. Tests run in a sandboxed subprocess (no Docker required) or optionally in Docker containers with `--docker`.
174
+
175
+ | Challenge | Type | Pytest Assertions |
176
+ |---|---|---|
177
+ | Binary Search Off-by-One | Bug fix | 7 |
178
+ | LRU Cache Eviction | Bug fix | 5 |
179
+ | Flatten Nested List | Bug fix | 6 |
180
+ | Rate Limiter Window | Implementation | 4 |
181
+ | Linked List Cycle (O(1) memory) | Implementation | 6 |
182
+ | Two Sum (O(n) time) | Algorithm | 5 |
183
+ | Merge Intervals | Algorithm | 6 |
184
+
185
+ ## How Scoring Works
186
+
187
+ ### Benchmark Suite
188
+
189
+ Each test is **pass/fail** with programmatic verification. The overall score is the percentage of tests passed. Category scores are averages within each category. No LLM judges another LLM.
190
+
191
+ ### Run Comparison
192
+
193
+ When comparing models with a custom prompt, Arbiter uses a **transparent composite formula**:
194
+
195
+ ```
196
+ Score = Speed(30%) + Quality(50%) + Responsiveness(20%)
197
+ ```
198
+
199
+ - **Speed:** `model_tps / best_tps` (higher tokens/sec = better)
200
+ - **Responsiveness:** `best_ttft / model_ttft` (lower time-to-first-token = better)
201
+ - **Quality:** `model_score / 10` (from optional LLM judge, or skipped with `--no-judge`)
202
+
203
+ You see every component. You see the raw values. You see exactly why Model A scored higher than Model B.
204
+
205
+ ### Elo Leaderboard
206
+
207
+ Every comparison updates a persistent [Elo rating](https://en.wikipedia.org/wiki/Elo_rating_system) stored in `~/.arbiter/leaderboard.json`. Rankings build over time across all your comparisons. Multi-model comparisons scale the K-factor to prevent rating inflation.
208
+
209
+ ## Multi-Provider Support
210
+
211
+ Arbiter works with any model you can run:
212
+
213
+ | Provider | Format | Example |
214
+ |---|---|---|
215
+ | **Ollama** (local) | `model_name` | `qwen3.5:4b` |
216
+ | **OpenAI** | `openai:model` | `openai:gpt-4o` |
217
+ | **Anthropic** | `anthropic:model` | `anthropic:claude-sonnet-4-20250514` |
218
+ | **Google** | `google:model` | `google:gemini-2.0-flash` |
219
+ | **Any OpenAI-compatible** | `http://host:port/v1:model` | Custom endpoints |
220
+
221
+ Cloud providers require API keys via environment variables (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`).
222
+
223
+ ## Low RAM? No Problem
224
+
225
+ Arbiter was built and tested on an **8GB M1 MacBook Air**. Sequential mode runs one model at a time so each gets your full RAM:
226
+
227
+ ```bash
228
+ arbiter benchmark --seq
229
+ arbiter run "your prompt" --seq
230
+ ```
231
+
232
+ The dashboard shows system RAM and warns if a model is too large.
233
+
234
+ ## Philosophy
235
+
236
+ - **Your hardware, your results.** Lab benchmarks don't tell you how a model runs on your machine.
237
+ - **Your prompts, your answers.** Generic benchmarks don't test what you actually use AI for.
238
+ - **No vibes.** Every test has a programmatic pass/fail. No "this feels like a 7/10."
239
+ - **Fully local.** Your code and prompts never leave your computer.
240
+ - **Transparent.** See every score, every formula, every raw value. No black boxes.
241
+
242
+ ## Contributing
243
+
244
+ We welcome contributions! Areas we need help with:
245
+
246
+ - **New test packs** -- submit benchmark tests for specific domains (data science, web dev, etc.)
247
+ - **New providers** -- add support for more LLM APIs
248
+ - **Dashboard improvements** -- better visualizations, mobile responsiveness
249
+ - **Documentation** -- tutorials, guides, translations
250
+
251
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for details.
252
+
253
+ ## License
254
+
255
+ MIT
256
+
257
+ ---
258
+
259
+ <p align="center">
260
+ Built by <a href="https://basaltlabs.app">BasaltLabs</a><br>
261
+ <sub>The final word on your local models.</sub>
262
+ </p>
@@ -0,0 +1,3 @@
1
+ """Arbiter - The final word on your local models."""
2
+
3
+ __version__ = "0.1.0"
File without changes