switchboard-llm 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- switchboard_llm-0.1.0/.gitignore +13 -0
- switchboard_llm-0.1.0/LICENSE +21 -0
- switchboard_llm-0.1.0/PKG-INFO +186 -0
- switchboard_llm-0.1.0/README.md +162 -0
- switchboard_llm-0.1.0/RESULTS.md +83 -0
- switchboard_llm-0.1.0/pyproject.toml +67 -0
- switchboard_llm-0.1.0/src/switchboard/__init__.py +57 -0
- switchboard_llm-0.1.0/src/switchboard/cache.py +56 -0
- switchboard_llm-0.1.0/src/switchboard/classify.py +142 -0
- switchboard_llm-0.1.0/src/switchboard/cli.py +106 -0
- switchboard_llm-0.1.0/src/switchboard/config.py +112 -0
- switchboard_llm-0.1.0/src/switchboard/engine.py +283 -0
- switchboard_llm-0.1.0/src/switchboard/gateway.py +124 -0
- switchboard_llm-0.1.0/src/switchboard/py.typed +0 -0
- switchboard_llm-0.1.0/src/switchboard/server.py +190 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Archit Dwivedi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: switchboard-llm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An OpenAI-compatible LLM router that saves cost without losing quality.
|
|
5
|
+
Project-URL: Homepage, https://github.com/archit0/switchboard
|
|
6
|
+
Project-URL: Repository, https://github.com/archit0/switchboard
|
|
7
|
+
Project-URL: Issues, https://github.com/archit0/switchboard/issues
|
|
8
|
+
Author: Archit Dwivedi
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: anthropic,cost,frugalgpt,gateway,gemini,llm,mixture-of-agents,openai,router
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Requires-Dist: fastapi>=0.110
|
|
21
|
+
Requires-Dist: httpx>=0.27
|
|
22
|
+
Requires-Dist: uvicorn[standard]>=0.29
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# switchboard
|
|
26
|
+
|
|
27
|
+
[![CI](https://github.com/archit0/switchboard/actions/workflows/ci.yml/badge.svg)](https://github.com/archit0/switchboard/actions/workflows/ci.yml)
|
|
28
|
+
[![PyPI](https://img.shields.io/pypi/v/switchboard-llm.svg)](https://pypi.org/project/switchboard-llm/)
|
|
29
|
+
|
|
30
|
+
An **OpenAI-compatible LLM router** that saves cost without losing quality. Point
|
|
31
|
+
any OpenAI client at it and it routes each request to the cheapest model that can
|
|
32
|
+
handle it — easy prompts to a small model, hard ones to a parallel
|
|
33
|
+
**Mixture-of-Agents** — trading a little latency for large savings while holding
|
|
34
|
+
(or beating) frontier-model quality on a representative workload.
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from openai import OpenAI
|
|
38
|
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
|
|
39
|
+
client.chat.completions.create(model="router-cost", messages=[{"role": "user", "content": "..."}])
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
It works on top of **any OpenAI-compatible gateway that fronts multiple providers**
|
|
43
|
+
behind one key (e.g. a LiteLLM proxy) — so one client can reach OpenAI, Anthropic,
|
|
44
|
+
and Google models just by changing the `model` field. The router is a thin policy
|
|
45
|
+
on top of that.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Install
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install switchboard-llm # or: uv add switchboard-llm
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Configure your gateway (any OpenAI-compatible endpoint):
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
export OPENAI_API_KEY=... # your gateway key
|
|
59
|
+
export OPENAI_BASE_URL=https://.../v1 # your endpoint
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Use it
|
|
63
|
+
|
|
64
|
+
**As a server** (drop-in for any OpenAI client):
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
switchboard serve # http://localhost:8000/v1 (use --port to change)
|
|
68
|
+
```
|
|
69
|
+
```python
|
|
70
|
+
from openai import OpenAI
|
|
71
|
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
|
|
72
|
+
r = client.chat.completions.create(model="router", messages=[{"role": "user", "content": "Hi"}])
|
|
73
|
+
print(r.model_extra["switchboard"]) # route, cost, savings telemetry
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
**As a library:**
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
import asyncio
|
|
80
|
+
from switchboard import Engine
|
|
81
|
+
|
|
82
|
+
async def main():
|
|
83
|
+
eng = Engine()
|
|
84
|
+
rr = await eng.answer([{"role": "user", "content": "What is 17 * 23?"}], mode="cost")
|
|
85
|
+
print(rr.content, f"${rr.cost:.6f}", f"{rr.savings_pct:.0f}% cheaper than Opus")
|
|
86
|
+
await eng.aclose()
|
|
87
|
+
|
|
88
|
+
asyncio.run(main())
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
**From the CLI:**
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
switchboard ask "Prove sqrt(2) is irrational" --mode quality
|
|
95
|
+
switchboard models # probe which gateway models are actually live
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## The honest thesis (read this first)
|
|
101
|
+
|
|
102
|
+
The goal is a router that is **cheaper than a frontier model (e.g. Opus) and
|
|
103
|
+
matches-or-beats it on benchmarks**. That is achievable — but only as a
|
|
104
|
+
**portfolio result over a realistic workload**, not a per-query miracle. The iron
|
|
105
|
+
law:
|
|
106
|
+
|
|
107
|
+
> On a *single hard query*, you cannot both beat the frontier model **and** be
|
|
108
|
+
> cheaper than it on that same query.
|
|
109
|
+
|
|
110
|
+
What you *can* do, and what this does:
|
|
111
|
+
|
|
112
|
+
| Traffic | What the router does | Outcome |
|
|
113
|
+
|---|---|---|
|
|
114
|
+
| **Easy queries** (most real traffic) | route to a cheap model | quality ties Opus, **5–50× cheaper** |
|
|
115
|
+
| **Hard queries** (the minority) | **Mixture-of-Agents**: several cheap/mid models answer in parallel, a synthesizer fuses them | quality can **match or exceed** a single Opus call, still **< Opus cost** |
|
|
116
|
+
| **Repeats** | exact-match cache | **free** |
|
|
117
|
+
|
|
118
|
+
Averaged over the workload, total spend is well below always-Opus and mean
|
|
119
|
+
accuracy is **equal-or-better**. Grounded in **RouteLLM**, **FrugalGPT** (cascade
|
|
120
|
+
with a judge), and **Mixture-of-Agents**.
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Modes
|
|
125
|
+
|
|
126
|
+
Pick the strategy via the `model` field:
|
|
127
|
+
|
|
128
|
+
| `model` | strategy |
|
|
129
|
+
|---|---|
|
|
130
|
+
| `router` / `router-balanced` | triage → single cheap (easy) / single mid (moderate) / Mixture-of-Agents (hard) |
|
|
131
|
+
| `router-cost` | **FrugalGPT cascade** — answer cheap, a judge scores it, escalate only if low |
|
|
132
|
+
| `router-quality` | bias one tier up — best quality while staying under Opus cost |
|
|
133
|
+
|
|
134
|
+
Any **real** model id (`claude-opus-4-8`, `gpt-5.5`, …) passes straight through, so
|
|
135
|
+
this also works as a plain multi-provider proxy.
|
|
136
|
+
|
|
137
|
+
## How it works
|
|
138
|
+
|
|
139
|
+
```
|
|
140
|
+
request ─► [cache] ─► [triage: how hard?] ─► [policy] ──► single cheap model (easy)
|
|
141
|
+
└─► single mid model (moderate)
|
|
142
|
+
└─► Mixture-of-Agents (hard)
|
|
143
|
+
proposers ∥ ─► synthesizer
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
- **Triage** (`src/switchboard/classify.py`) — free heuristics (length, code/math
|
|
147
|
+
markers, multi-step verbs) decide obvious cases; a tiny LLM classifier scores the
|
|
148
|
+
ambiguous middle. Output: difficulty 1–5 → tier.
|
|
149
|
+
- **Policy / execution** (`src/switchboard/engine.py`) — `single`, `moa` (parallel
|
|
150
|
+
proposers + synthesizer), or `cascade` (cheap → judge → escalate).
|
|
151
|
+
- **Cost accounting** — every response carries its internal cost, an estimate of
|
|
152
|
+
what always-Opus would have cost, and the savings %, under a `switchboard` key.
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Results
|
|
157
|
+
|
|
158
|
+
On **GSM8K (50 items, exact numeric grading)**, baseline = always `claude-opus-4-8`:
|
|
159
|
+
|
|
160
|
+
| config | accuracy | total cost | vs Opus |
|
|
161
|
+
|---|---|---|---|
|
|
162
|
+
| always-Opus | 100.0% | $0.3674 | baseline |
|
|
163
|
+
| `router-cost` | **100.0%** | $0.0064 | **57× cheaper — Pareto win** |
|
|
164
|
+
| `router-quality` | 100.0% | $0.2781 | 1.3× cheaper |
|
|
165
|
+
| `router-balanced` | 92.0% | $0.0611 | 6× cheaper but lost accuracy |
|
|
166
|
+
|
|
167
|
+
Reproduce (from a repository checkout — `bench/` is not shipped in the PyPI package): `python -m bench.run_gsm8k --n 50 --seed 0`. Full write-up and honest
|
|
168
|
+
caveats in [`RESULTS.md`](RESULTS.md). (The verifier is what makes routing safe —
|
|
169
|
+
`router-balanced` has none and lost 8 points; `router-cost`'s judge is the fix.)
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## Limitations & next steps
|
|
174
|
+
|
|
175
|
+
- **Pricing is a list-price proxy** (`src/switchboard/config.py`). Drop your real
|
|
176
|
+
rate card into `pricing.json` (`{"model": [in_per_1M, out_per_1M]}`) to override.
|
|
177
|
+
- **Triage under-detects "deceptively simple" trap questions** — `router-cost`/
|
|
178
|
+
`router-quality` compensate via the judge/MoA.
|
|
179
|
+
- **Streaming is simulated** (full answer computed, then chunked) — MoA can't
|
|
180
|
+
token-stream; only the single-model path could truly stream.
|
|
181
|
+
- **Semantic cache** (embed prompt → nearest neighbour) is not yet wired.
|
|
182
|
+
- **The gateway's `/v1/models` list may be stale** — trust `switchboard models`.
|
|
183
|
+
|
|
184
|
+
## License
|
|
185
|
+
|
|
186
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# switchboard
|
|
2
|
+
|
|
3
|
+
[![CI](https://github.com/archit0/switchboard/actions/workflows/ci.yml/badge.svg)](https://github.com/archit0/switchboard/actions/workflows/ci.yml)
|
|
4
|
+
[![PyPI](https://img.shields.io/pypi/v/switchboard-llm.svg)](https://pypi.org/project/switchboard-llm/)
|
|
5
|
+
|
|
6
|
+
An **OpenAI-compatible LLM router** that saves cost without losing quality. Point
|
|
7
|
+
any OpenAI client at it and it routes each request to the cheapest model that can
|
|
8
|
+
handle it — easy prompts to a small model, hard ones to a parallel
|
|
9
|
+
**Mixture-of-Agents** — trading a little latency for large savings while holding
|
|
10
|
+
(or beating) frontier-model quality on a representative workload.
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
from openai import OpenAI
|
|
14
|
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
|
|
15
|
+
client.chat.completions.create(model="router-cost", messages=[{"role": "user", "content": "..."}])
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
It works on top of **any OpenAI-compatible gateway that fronts multiple providers**
|
|
19
|
+
behind one key (e.g. a LiteLLM proxy) — so one client can reach OpenAI, Anthropic,
|
|
20
|
+
and Google models just by changing the `model` field. The router is a thin policy
|
|
21
|
+
on top of that.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Install
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install switchboard-llm # or: uv add switchboard-llm
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Configure your gateway (any OpenAI-compatible endpoint):
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
export OPENAI_API_KEY=... # your gateway key
|
|
35
|
+
export OPENAI_BASE_URL=https://.../v1 # your endpoint
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Use it
|
|
39
|
+
|
|
40
|
+
**As a server** (drop-in for any OpenAI client):
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
switchboard serve # http://localhost:8000/v1 (use --port to change)
|
|
44
|
+
```
|
|
45
|
+
```python
|
|
46
|
+
from openai import OpenAI
|
|
47
|
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
|
|
48
|
+
r = client.chat.completions.create(model="router", messages=[{"role": "user", "content": "Hi"}])
|
|
49
|
+
print(r.model_extra["switchboard"]) # route, cost, savings telemetry
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**As a library:**
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
import asyncio
|
|
56
|
+
from switchboard import Engine
|
|
57
|
+
|
|
58
|
+
async def main():
|
|
59
|
+
eng = Engine()
|
|
60
|
+
rr = await eng.answer([{"role": "user", "content": "What is 17 * 23?"}], mode="cost")
|
|
61
|
+
print(rr.content, f"${rr.cost:.6f}", f"{rr.savings_pct:.0f}% cheaper than Opus")
|
|
62
|
+
await eng.aclose()
|
|
63
|
+
|
|
64
|
+
asyncio.run(main())
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
**From the CLI:**
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
switchboard ask "Prove sqrt(2) is irrational" --mode quality
|
|
71
|
+
switchboard models # probe which gateway models are actually live
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
---
|
|
75
|
+
|
|
76
|
+
## The honest thesis (read this first)
|
|
77
|
+
|
|
78
|
+
The goal is a router that is **cheaper than a frontier model (e.g. Opus) and
|
|
79
|
+
matches-or-beats it on benchmarks**. That is achievable — but only as a
|
|
80
|
+
**portfolio result over a realistic workload**, not a per-query miracle. The iron
|
|
81
|
+
law:
|
|
82
|
+
|
|
83
|
+
> On a *single hard query*, you cannot both beat the frontier model **and** be
|
|
84
|
+
> cheaper than it on that same query.
|
|
85
|
+
|
|
86
|
+
What you *can* do, and what this does:
|
|
87
|
+
|
|
88
|
+
| Traffic | What the router does | Outcome |
|
|
89
|
+
|---|---|---|
|
|
90
|
+
| **Easy queries** (most real traffic) | route to a cheap model | quality ties Opus, **5–50× cheaper** |
|
|
91
|
+
| **Hard queries** (the minority) | **Mixture-of-Agents**: several cheap/mid models answer in parallel, a synthesizer fuses them | quality can **match or exceed** a single Opus call, still **< Opus cost** |
|
|
92
|
+
| **Repeats** | exact-match cache | **free** |
|
|
93
|
+
|
|
94
|
+
Averaged over the workload, total spend is well below always-Opus and mean
|
|
95
|
+
accuracy is **equal-or-better**. Grounded in **RouteLLM**, **FrugalGPT** (cascade
|
|
96
|
+
with a judge), and **Mixture-of-Agents**.
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## Modes
|
|
101
|
+
|
|
102
|
+
Pick the strategy via the `model` field:
|
|
103
|
+
|
|
104
|
+
| `model` | strategy |
|
|
105
|
+
|---|---|
|
|
106
|
+
| `router` / `router-balanced` | triage → single cheap (easy) / single mid (moderate) / Mixture-of-Agents (hard) |
|
|
107
|
+
| `router-cost` | **FrugalGPT cascade** — answer cheap, a judge scores it, escalate only if low |
|
|
108
|
+
| `router-quality` | bias one tier up — best quality while staying under Opus cost |
|
|
109
|
+
|
|
110
|
+
Any **real** model id (`claude-opus-4-8`, `gpt-5.5`, …) passes straight through, so
|
|
111
|
+
this also works as a plain multi-provider proxy.
|
|
112
|
+
|
|
113
|
+
## How it works
|
|
114
|
+
|
|
115
|
+
```
|
|
116
|
+
request ─► [cache] ─► [triage: how hard?] ─► [policy] ──► single cheap model (easy)
|
|
117
|
+
└─► single mid model (moderate)
|
|
118
|
+
└─► Mixture-of-Agents (hard)
|
|
119
|
+
proposers ∥ ─► synthesizer
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
- **Triage** (`src/switchboard/classify.py`) — free heuristics (length, code/math
|
|
123
|
+
markers, multi-step verbs) decide obvious cases; a tiny LLM classifier scores the
|
|
124
|
+
ambiguous middle. Output: difficulty 1–5 → tier.
|
|
125
|
+
- **Policy / execution** (`src/switchboard/engine.py`) — `single`, `moa` (parallel
|
|
126
|
+
proposers + synthesizer), or `cascade` (cheap → judge → escalate).
|
|
127
|
+
- **Cost accounting** — every response carries its internal cost, an estimate of
|
|
128
|
+
what always-Opus would have cost, and the savings %, under a `switchboard` key.
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## Results
|
|
133
|
+
|
|
134
|
+
On **GSM8K (50 items, exact numeric grading)**, baseline = always `claude-opus-4-8`:
|
|
135
|
+
|
|
136
|
+
| config | accuracy | total cost | vs Opus |
|
|
137
|
+
|---|---|---|---|
|
|
138
|
+
| always-Opus | 100.0% | $0.3674 | baseline |
|
|
139
|
+
| `router-cost` | **100.0%** | $0.0064 | **57× cheaper — Pareto win** |
|
|
140
|
+
| `router-quality` | 100.0% | $0.2781 | 1.3× cheaper |
|
|
141
|
+
| `router-balanced` | 92.0% | $0.0611 | 6× cheaper but lost accuracy |
|
|
142
|
+
|
|
143
|
+
Reproduce (from a repository checkout — `bench/` is not shipped in the PyPI package): `python -m bench.run_gsm8k --n 50 --seed 0`. Full write-up and honest
|
|
144
|
+
caveats in [`RESULTS.md`](RESULTS.md). (The verifier is what makes routing safe —
|
|
145
|
+
`router-balanced` has none and lost 8 points; `router-cost`'s judge is the fix.)
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## Limitations & next steps
|
|
150
|
+
|
|
151
|
+
- **Pricing is a list-price proxy** (`src/switchboard/config.py`). Drop your real
|
|
152
|
+
rate card into `pricing.json` (`{"model": [in_per_1M, out_per_1M]}`) to override.
|
|
153
|
+
- **Triage under-detects "deceptively simple" trap questions** — `router-cost`/
|
|
154
|
+
`router-quality` compensate via the judge/MoA.
|
|
155
|
+
- **Streaming is simulated** (full answer computed, then chunked) — MoA can't
|
|
156
|
+
token-stream; only the single-model path could truly stream.
|
|
157
|
+
- **Semantic cache** (embed prompt → nearest neighbour) is not yet wired.
|
|
158
|
+
- **The gateway's `/v1/models` list may be stale** — trust `switchboard models`.
|
|
159
|
+
|
|
160
|
+
## License
|
|
161
|
+
|
|
162
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# Measured results
|
|
2
|
+
|
|
3
|
+
## GSM8K — 50 items, seed 0 (recognized benchmark)
|
|
4
|
+
|
|
5
|
+
`.venv/bin/python -m bench.run_gsm8k --n 50 --seed 0`. Zero-shot chain-of-thought,
|
|
6
|
+
exact numeric grading (gold = number after `####`), no LLM judge. Baseline =
|
|
7
|
+
always `claude-opus-4-8`, same 50 items for every config.
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
config acc total$ $/correct mean_ms
|
|
11
|
+
------------------------------------------------------------------------------
|
|
12
|
+
claude-opus-4-8 100.0% 0.36744 0.007349 2094
|
|
13
|
+
router-balanced 92.0% 0.06106 0.001327 4025
|
|
14
|
+
router-cost 100.0% 0.00640 0.000128 2898
|
|
15
|
+
router-quality 100.0% 0.27812 0.005562 6784
|
|
16
|
+
------------------------------------------------------------------------------
|
|
17
|
+
router-cost acc 100.0% (+0.0pt) cost $0.00640 (57.4x cheaper) PARETO-WIN
|
|
18
|
+
router-quality acc 100.0% (+0.0pt) cost $0.27812 ( 1.3x cheaper) PARETO-WIN
|
|
19
|
+
router-balanced acc 92.0% (-8.0pt) cost $0.06106 ( 6.0x cheaper) cheaper, LOWER acc
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
**Findings (honest):**
|
|
23
|
+
- **`router-cost` (FrugalGPT cascade) is Pareto-dominant: ties Opus at 100% for
|
|
24
|
+
57× less cost.** The cheap model + judge resolves most items; only low-confidence
|
|
25
|
+
ones escalate. This is the headline result on a recognized benchmark.
|
|
26
|
+
- **`router-balanced` LOST 8 points** (4/50 items). This is the real cost of naive
|
|
27
|
+
single-model routing with *no verification* — when the mid model errs, nothing
|
|
28
|
+
catches it. The cascade's judge is precisely the fix, and the data shows it.
|
|
29
|
+
Reported, not hidden. (Actionable next step: add a light verifier to the
|
|
30
|
+
balanced mid tier.)
|
|
31
|
+
- **Magnitude caveat:** GSM8K is now *easy* for modern small models, so 57× is
|
|
32
|
+
near the optimistic end. Expect smaller-but-real savings on harder suites
|
|
33
|
+
(MMLU-Pro, GPQA) where the cheap tier carries less of the load.
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Mixed smoke set — 15 items
|
|
38
|
+
|
|
39
|
+
Run of `.venv/bin/python -m bench.run_bench` on the 15-item mixed set
|
|
40
|
+
(easy factual/math, CRT "trap" questions, open-ended reasoning & code).
|
|
41
|
+
Baseline = always `claude-opus-4-8`. Grader = `gemini-3.1-pro-preview`
|
|
42
|
+
(independent of the MoA pool). Costs use the list-price proxies in
|
|
43
|
+
`src/switchboard/config.py`.
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
config acc total$ $/correct mean_ms
|
|
47
|
+
------------------------------------------------------------------------------
|
|
48
|
+
claude-opus-4-8 93.3% 0.10338 0.00738 3084
|
|
49
|
+
router-balanced 93.3% 0.01162 0.00083 3138
|
|
50
|
+
router-cost 100.0% 0.01106 0.00074 4534
|
|
51
|
+
router-quality 100.0% 0.04278 0.00285 5502
|
|
52
|
+
------------------------------------------------------------------------------
|
|
53
|
+
router-balanced acc 93.3% (+0.0pt) cost $0.01162 ( 8.9x cheaper) PARETO-WIN
|
|
54
|
+
router-cost acc 100.0% (+6.7pt) cost $0.01106 ( 9.3x cheaper) PARETO-WIN
|
|
55
|
+
router-quality acc 100.0% (+6.7pt) cost $0.04278 ( 2.4x cheaper) PARETO-WIN
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Reading this honestly
|
|
59
|
+
|
|
60
|
+
- **On this mixed set, all three router modes are Pareto wins**: equal-or-better accuracy than
|
|
61
|
+
always-Opus at 2.4×–9.3× lower cost. `router-cost` is strictly better on
|
|
62
|
+
*both* axes here.
|
|
63
|
+
- **`router-balanced` matches Opus at near-Opus latency** (3.1s vs 3.1s) because
|
|
64
|
+
most items route to a single cheap model. `router-quality` is slower (5.5s)
|
|
65
|
+
because it fires Mixture-of-Agents on more items — that is the latency you pay
|
|
66
|
+
for the parallel internal calls.
|
|
67
|
+
- **Caveats (don't over-read a small sample):**
|
|
68
|
+
- 15 items, single-vote judge → the 3 open-ended items carry grader noise.
|
|
69
|
+
Opus's (correct) √2 proof was marked wrong by the judge in this run; a
|
|
70
|
+
majority-vote judge would smooth this out.
|
|
71
|
+
- The 12 exact-match items are noise-free and every config answered all 12
|
|
72
|
+
correctly — the accuracy separation comes entirely from the judged items.
|
|
73
|
+
- Costs are **list-price proxies**, not your gateway's actual rates. Swap in the real
|
|
74
|
+
rate card via `pricing.json` to get true dollar savings.
|
|
75
|
+
|
|
76
|
+
## How to reproduce / scale up
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
.venv/bin/python -m bench.run_bench # this run
|
|
80
|
+
.venv/bin/python -m bench.run_bench --n 8 # quick smoke
|
|
81
|
+
# add your own items to bench/dataset.py to test on YOUR workload mix —
|
|
82
|
+
# the savings depend heavily on how much of your traffic is genuinely easy.
|
|
83
|
+
```
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "switchboard-llm"
|
|
7
|
+
description = "An OpenAI-compatible LLM router that saves cost without losing quality."
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
license = "MIT"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
authors = [{ name = "Archit Dwivedi" }]
|
|
12
|
+
keywords = ["llm", "router", "openai", "anthropic", "gemini", "mixture-of-agents", "frugalgpt", "cost", "gateway"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 4 - Beta",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Programming Language :: Python :: 3.12",
|
|
19
|
+
"Programming Language :: Python :: 3.13",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
21
|
+
]
|
|
22
|
+
dynamic = ["version"]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"httpx>=0.27",
|
|
25
|
+
"fastapi>=0.110",
|
|
26
|
+
"uvicorn[standard]>=0.29",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.scripts]
|
|
30
|
+
switchboard = "switchboard.cli:main"
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://github.com/archit0/switchboard"
|
|
34
|
+
Repository = "https://github.com/archit0/switchboard"
|
|
35
|
+
Issues = "https://github.com/archit0/switchboard/issues"
|
|
36
|
+
|
|
37
|
+
[dependency-groups]
|
|
38
|
+
dev = [
|
|
39
|
+
"pytest>=8",
|
|
40
|
+
"ruff>=0.6",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
[tool.hatch.version]
|
|
44
|
+
path = "src/switchboard/__init__.py"
|
|
45
|
+
|
|
46
|
+
[tool.hatch.build.targets.wheel]
|
|
47
|
+
packages = ["src/switchboard"]
|
|
48
|
+
|
|
49
|
+
[tool.hatch.build.targets.sdist]
|
|
50
|
+
include = ["src/switchboard", "README.md", "RESULTS.md", "LICENSE"]
|
|
51
|
+
|
|
52
|
+
[tool.pytest.ini_options]
|
|
53
|
+
testpaths = ["tests"]
|
|
54
|
+
|
|
55
|
+
[tool.ruff]
|
|
56
|
+
line-length = 120
|
|
57
|
+
src = ["src", "tests", "bench", "tools"]
|
|
58
|
+
|
|
59
|
+
[tool.ruff.lint]
|
|
60
|
+
select = ["E", "F", "I", "UP", "B", "W", "C4"]
|
|
61
|
+
ignore = [
|
|
62
|
+
"E501", # long lines (lots of explanatory comments / prompts)
|
|
63
|
+
"B008", # function calls in argument defaults (FastAPI-style)
|
|
64
|
+
]
|
|
65
|
+
|
|
66
|
+
[tool.ruff.lint.per-file-ignores]
|
|
67
|
+
"tests/**" = ["S101"]
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""switchboard — an OpenAI-compatible LLM router that saves cost without losing quality.
|
|
2
|
+
|
|
3
|
+
Point any OpenAI client at the switchboard server and it routes each request to
|
|
4
|
+
the cheapest model that can handle it — easy prompts to a small model, hard ones
|
|
5
|
+
to a parallel Mixture-of-Agents — trading a little latency for large savings while
|
|
6
|
+
holding (or beating) frontier-model quality on a representative workload.
|
|
7
|
+
|
|
8
|
+
Quickstart (library)::
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
from switchboard import Engine
|
|
12
|
+
|
|
13
|
+
async def main():
|
|
14
|
+
eng = Engine()
|
|
15
|
+
result = await eng.answer(
|
|
16
|
+
[{"role": "user", "content": "What is 17 * 23?"}],
|
|
17
|
+
mode="cost",
|
|
18
|
+
)
|
|
19
|
+
print(result.content, result.cost, result.savings_pct)
|
|
20
|
+
await eng.aclose()
|
|
21
|
+
|
|
22
|
+
asyncio.run(main())
|
|
23
|
+
|
|
24
|
+
Quickstart (OpenAI-compatible server)::
|
|
25
|
+
|
|
26
|
+
$ switchboard serve # http://localhost:8000/v1
|
|
27
|
+
|
|
28
|
+
from openai import OpenAI
|
|
29
|
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
|
|
30
|
+
client.chat.completions.create(model="router-cost", messages=[...])
|
|
31
|
+
|
|
32
|
+
The gateway is configured via the ``OPENAI_BASE_URL`` and ``OPENAI_API_KEY``
|
|
33
|
+
environment variables (any OpenAI-compatible endpoint that fronts multiple
|
|
34
|
+
providers — e.g. a LiteLLM proxy — works).
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
from switchboard.cache import ResponseCache
|
|
38
|
+
from switchboard.classify import Triage, triage
|
|
39
|
+
from switchboard.config import GatewayConfig, cost_usd, price_of
|
|
40
|
+
from switchboard.engine import Engine, RouteResult
|
|
41
|
+
from switchboard.gateway import Completion, Gateway
|
|
42
|
+
|
|
43
|
+
# Single source of truth for the package version: pyproject.toml declares
# `dynamic = ["version"]` and points `[tool.hatch.version]` at this file, so
# the build metadata takes its version from this assignment.
__version__ = "0.1.0"
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
"Completion",
|
|
47
|
+
"Engine",
|
|
48
|
+
"Gateway",
|
|
49
|
+
"GatewayConfig",
|
|
50
|
+
"ResponseCache",
|
|
51
|
+
"RouteResult",
|
|
52
|
+
"Triage",
|
|
53
|
+
"__version__",
|
|
54
|
+
"cost_usd",
|
|
55
|
+
"price_of",
|
|
56
|
+
"triage",
|
|
57
|
+
]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Exact-match response cache.
|
|
2
|
+
|
|
3
|
+
The cheapest API call is the one you never make. Repeated/identical prompts
|
|
4
|
+
(very common in agent loops and eval harnesses) return instantly at zero cost.
|
|
5
|
+
A semantic cache (embed the prompt, nearest-neighbour over past prompts) is the
|
|
6
|
+
natural next step — the gateway exposes `gemini-embedding-*` and
|
|
7
|
+
`text-embedding-3-*` for exactly this — but exact-match already captures the
|
|
8
|
+
biggest, safest wins without false hits.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import hashlib
|
|
14
|
+
import json
|
|
15
|
+
import threading
|
|
16
|
+
import time
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _key(messages: list[dict], mode: str) -> str:
    """Return a stable SHA-256 hex digest identifying a ``(messages, mode)`` pair.

    The pair is serialised to JSON with ``sort_keys=True`` so that the order of
    keys inside each message dict does not affect the digest, then hashed.

    Args:
        messages: Chat messages; every value must be JSON-serialisable.
        mode: Routing mode the response was (or will be) produced under.

    Returns:
        A 64-character lowercase hexadecimal digest.

    Raises:
        TypeError: If ``messages`` contains a value ``json.dumps`` cannot
            serialise.
    """
    blob = json.dumps({"m": messages, "mode": mode}, sort_keys=True, ensure_ascii=False)
    # ``ensure_ascii=False`` keeps lone surrogates (e.g. decoded from a JSON
    # body holding an unpaired "\ud83d" escape) verbatim in the text, and a
    # strict UTF-8 encode raises UnicodeEncodeError on them. ``surrogatepass``
    # encodes them instead; for every valid string the bytes are unchanged.
    return hashlib.sha256(blob.encode("utf-8", "surrogatepass")).hexdigest()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ResponseCache:
    """Thread-safe, in-memory, exact-match cache keyed by ``(messages, mode)``.

    Entries live in a plain dict guarded by a lock. When the cache is full the
    least recently written entry is evicted. When ``ttl_seconds`` is set,
    entries older than that are dropped lazily the next time they are looked up.

    Attributes:
        hits: Number of ``get`` calls that returned a cached value.
        misses: Number of ``get`` calls that found no usable entry.
    """

    def __init__(self, max_items: int = 4096, ttl_seconds: float | None = None):
        """Create an empty cache.

        Args:
            max_items: Maximum number of entries to keep. Zero or a negative
                value disables storing altogether.
            ttl_seconds: Lifetime of an entry in seconds, or ``None`` for
                entries that never expire.
        """
        # digest -> (monotonic time of last write, cached value). The dict is
        # kept in write order (oldest first) so eviction needs no scan.
        self._store: dict[str, tuple[float, Any]] = {}
        self._lock = threading.Lock()
        self._max = max_items
        self._ttl = ttl_seconds
        self.hits = 0
        self.misses = 0

    def get(self, messages: list[dict], mode: str) -> Any | None:
        """Return the cached value for ``(messages, mode)``, or ``None``.

        ``None`` is returned when there is no entry or the entry has outlived
        the TTL (in which case it is also removed). A stored value of ``None``
        is therefore indistinguishable from a miss for the caller, although it
        is counted as a hit.
        """
        k = _key(messages, mode)
        with self._lock:
            item = self._store.get(k)
            if item is not None:
                ts, val = item
                # Age is measured on the monotonic clock so wall-clock
                # adjustments can neither expire entries early nor keep them
                # alive past their TTL.
                if self._ttl is None or (time.monotonic() - ts) <= self._ttl:
                    self.hits += 1
                    return val
                del self._store[k]
            self.misses += 1
            return None

    def put(self, messages: list[dict], mode: str, value: Any) -> None:
        """Store ``value`` for ``(messages, mode)``, evicting the oldest entry if full.

        Writing to an existing key replaces its value and refreshes its age.
        Does nothing when the cache was created with ``max_items <= 0``.
        """
        if self._max <= 0:
            # Capacity of zero means "caching disabled"; there is nothing to
            # evict and nothing may be stored.
            return
        k = _key(messages, mode)
        with self._lock:
            if k in self._store:
                # Remove first so the re-insert below moves the refreshed
                # entry to the newest position in the dict.
                del self._store[k]
            elif len(self._store) >= self._max:
                # Dicts preserve insertion order and every write re-inserts,
                # so the first key is always the least recently written one.
                del self._store[next(iter(self._store))]
            self._store[k] = (time.monotonic(), value)
|