switchboard-llm 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ .secrets/
5
+ .env
6
+ .env.local
7
+ bench/data/
8
+ dist/
9
+ build/
10
+ *.egg-info/
11
+ .ruff_cache/
12
+ .pytest_cache/
13
+ uv.lock
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Archit Dwivedi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,186 @@
1
+ Metadata-Version: 2.4
2
+ Name: switchboard-llm
3
+ Version: 0.1.0
4
+ Summary: An OpenAI-compatible LLM router that saves cost without losing quality.
5
+ Project-URL: Homepage, https://github.com/archit0/switchboard
6
+ Project-URL: Repository, https://github.com/archit0/switchboard
7
+ Project-URL: Issues, https://github.com/archit0/switchboard/issues
8
+ Author: Archit Dwivedi
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: anthropic,cost,frugalgpt,gateway,gemini,llm,mixture-of-agents,openai,router
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: fastapi>=0.110
21
+ Requires-Dist: httpx>=0.27
22
+ Requires-Dist: uvicorn[standard]>=0.29
23
+ Description-Content-Type: text/markdown
24
+
25
+ # switchboard
26
+
27
+ [![CI](https://github.com/archit0/switchboard/actions/workflows/ci.yml/badge.svg)](https://github.com/archit0/switchboard/actions/workflows/ci.yml)
28
+ [![PyPI](https://img.shields.io/pypi/v/switchboard-llm.svg)](https://pypi.org/project/switchboard-llm/)
29
+
30
+ An **OpenAI-compatible LLM router** that saves cost without losing quality. Point
31
+ any OpenAI client at it and it routes each request to the cheapest model that can
32
+ handle it — easy prompts to a small model, hard ones to a parallel
33
+ **Mixture-of-Agents** — trading a little latency for large savings while holding
34
+ (or beating) frontier-model quality on a representative workload.
35
+
36
+ ```python
37
+ from openai import OpenAI
38
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
39
+ client.chat.completions.create(model="router-cost", messages=[{"role": "user", "content": "..."}])
40
+ ```
41
+
42
+ It works on top of **any OpenAI-compatible gateway that fronts multiple providers**
43
+ behind one key (e.g. a LiteLLM proxy) — so one client can reach OpenAI, Anthropic,
44
+ and Google models just by changing the `model` field. The router is a thin policy
45
+ on top of that.
46
+
47
+ ---
48
+
49
+ ## Install
50
+
51
+ ```bash
52
+ pip install switchboard-llm # or: uv add switchboard-llm
53
+ ```
54
+
55
+ Configure your gateway (any OpenAI-compatible endpoint):
56
+
57
+ ```bash
58
+ export OPENAI_API_KEY=... # your gateway key
59
+ export OPENAI_BASE_URL=https://.../v1 # your endpoint
60
+ ```
61
+
62
+ ## Use it
63
+
64
+ **As a server** (drop-in for any OpenAI client):
65
+
66
+ ```bash
67
+ switchboard serve # http://localhost:8000/v1 (use --port to change)
68
+ ```
69
+ ```python
70
+ from openai import OpenAI
71
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
72
+ r = client.chat.completions.create(model="router", messages=[{"role": "user", "content": "Hi"}])
73
+ print(r.model_extra["switchboard"]) # route, cost, savings telemetry
74
+ ```
75
+
76
+ **As a library:**
77
+
78
+ ```python
79
+ import asyncio
80
+ from switchboard import Engine
81
+
82
+ async def main():
83
+ eng = Engine()
84
+ rr = await eng.answer([{"role": "user", "content": "What is 17 * 23?"}], mode="cost")
85
+ print(rr.content, f"${rr.cost:.6f}", f"{rr.savings_pct:.0f}% cheaper than Opus")
86
+ await eng.aclose()
87
+
88
+ asyncio.run(main())
89
+ ```
90
+
91
+ **From the CLI:**
92
+
93
+ ```bash
94
+ switchboard ask "Prove sqrt(2) is irrational" --mode quality
95
+ switchboard models # probe which gateway models are actually live
96
+ ```
97
+
98
+ ---
99
+
100
+ ## The honest thesis (read this first)
101
+
102
+ The goal is a router that is **cheaper than a frontier model (e.g. Opus) and
103
+ matches-or-beats it on benchmarks**. That is achievable — but only as a
104
+ **portfolio result over a realistic workload**, not a per-query miracle. The iron
105
+ law:
106
+
107
+ > On a *single hard query*, you cannot both beat the frontier model **and** be
108
+ > cheaper than it on that same query.
109
+
110
+ What you *can* do, and what this does:
111
+
112
+ | Traffic | What the router does | Outcome |
113
+ |---|---|---|
114
+ | **Easy queries** (most real traffic) | route to a cheap model | quality ties Opus, **5–50× cheaper** |
115
+ | **Hard queries** (the minority) | **Mixture-of-Agents**: several cheap/mid models answer in parallel, a synthesizer fuses them | quality can **match or exceed** a single Opus call, still **< Opus cost** |
116
+ | **Repeats** | exact-match cache | **free** |
117
+
118
+ Averaged over the workload, total spend is well below always-Opus and mean
119
+ accuracy is **equal-or-better**. Grounded in **RouteLLM**, **FrugalGPT** (cascade
120
+ with a judge), and **Mixture-of-Agents**.
121
+
122
+ ---
123
+
124
+ ## Modes
125
+
126
+ Pick the strategy via the `model` field:
127
+
128
+ | `model` | strategy |
129
+ |---|---|
130
+ | `router` / `router-balanced` | triage → single cheap (easy) / single mid (moderate) / Mixture-of-Agents (hard) |
131
+ | `router-cost` | **FrugalGPT cascade** — answer cheap, a judge scores it, escalate only if low |
132
+ | `router-quality` | bias one tier up — best quality while staying under Opus cost |
133
+
134
+ Any **real** model id (`claude-opus-4-8`, `gpt-5.5`, …) passes straight through, so
135
+ this also works as a plain multi-provider proxy.
136
+
137
+ ## How it works
138
+
139
+ ```
140
+ request ─► [cache] ─► [triage: how hard?] ─► [policy] ──► single cheap model (easy)
141
+ └─► single mid model (moderate)
142
+ └─► Mixture-of-Agents (hard)
143
+ proposers ∥ ─► synthesizer
144
+ ```
145
+
146
+ - **Triage** (`src/switchboard/classify.py`) — free heuristics (length, code/math
147
+ markers, multi-step verbs) decide obvious cases; a tiny LLM classifier scores the
148
+ ambiguous middle. Output: difficulty 1–5 → tier.
149
+ - **Policy / execution** (`src/switchboard/engine.py`) — `single`, `moa` (parallel
150
+ proposers + synthesizer), or `cascade` (cheap → judge → escalate).
151
+ - **Cost accounting** — every response carries its internal cost, an estimate of
152
+ what always-Opus would have cost, and the savings %, under a `switchboard` key.
153
+
154
+ ---
155
+
156
+ ## Results
157
+
158
+ On **GSM8K (50 items, exact numeric grading)**, baseline = always `claude-opus-4-8`:
159
+
160
+ | config | accuracy | total cost | vs Opus |
161
+ |---|---|---|---|
162
+ | always-Opus | 100.0% | $0.3674 | baseline |
163
+ | `router-cost` | **100.0%** | $0.0064 | **57× cheaper — Pareto win** |
164
+ | `router-quality` | 100.0% | $0.2781 | 1.3× cheaper |
165
+ | `router-balanced` | 92.0% | $0.0611 | 6× cheaper but lost accuracy |
166
+
167
+ Reproduce: `python -m bench.run_gsm8k --n 50 --seed 0`. Full write-up and honest
168
+ caveats in [`RESULTS.md`](RESULTS.md). (The verifier is what makes routing safe —
169
+ `router-balanced` has none and lost 8 points; `router-cost`'s judge is the fix.)
170
+
171
+ ---
172
+
173
+ ## Limitations & next steps
174
+
175
+ - **Pricing is a list-price proxy** (`src/switchboard/config.py`). Drop your real
176
+ rate card into `pricing.json` (`{"model": [in_per_1M, out_per_1M]}`) to override.
177
+ - **Triage under-detects "deceptively simple" trap questions** — `router-cost`/
178
+ `router-quality` compensate via the judge/MoA.
179
+ - **Streaming is simulated** (full answer computed, then chunked) — MoA can't
180
+ token-stream; only the single-model path could truly stream.
181
+ - **Semantic cache** (embed prompt → nearest neighbour) is not yet wired.
182
+ - **The gateway's `/v1/models` list may be stale** — trust `switchboard models`.
183
+
184
+ ## License
185
+
186
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,162 @@
1
+ # switchboard
2
+
3
+ [![CI](https://github.com/archit0/switchboard/actions/workflows/ci.yml/badge.svg)](https://github.com/archit0/switchboard/actions/workflows/ci.yml)
4
+ [![PyPI](https://img.shields.io/pypi/v/switchboard-llm.svg)](https://pypi.org/project/switchboard-llm/)
5
+
6
+ An **OpenAI-compatible LLM router** that saves cost without losing quality. Point
7
+ any OpenAI client at it and it routes each request to the cheapest model that can
8
+ handle it — easy prompts to a small model, hard ones to a parallel
9
+ **Mixture-of-Agents** — trading a little latency for large savings while holding
10
+ (or beating) frontier-model quality on a representative workload.
11
+
12
+ ```python
13
+ from openai import OpenAI
14
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
15
+ client.chat.completions.create(model="router-cost", messages=[{"role": "user", "content": "..."}])
16
+ ```
17
+
18
+ It works on top of **any OpenAI-compatible gateway that fronts multiple providers**
19
+ behind one key (e.g. a LiteLLM proxy) — so one client can reach OpenAI, Anthropic,
20
+ and Google models just by changing the `model` field. The router is a thin policy
21
+ on top of that.
22
+
23
+ ---
24
+
25
+ ## Install
26
+
27
+ ```bash
28
+ pip install switchboard-llm # or: uv add switchboard-llm
29
+ ```
30
+
31
+ Configure your gateway (any OpenAI-compatible endpoint):
32
+
33
+ ```bash
34
+ export OPENAI_API_KEY=... # your gateway key
35
+ export OPENAI_BASE_URL=https://.../v1 # your endpoint
36
+ ```
37
+
38
+ ## Use it
39
+
40
+ **As a server** (drop-in for any OpenAI client):
41
+
42
+ ```bash
43
+ switchboard serve # http://localhost:8000/v1 (use --port to change)
44
+ ```
45
+ ```python
46
+ from openai import OpenAI
47
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
48
+ r = client.chat.completions.create(model="router", messages=[{"role": "user", "content": "Hi"}])
49
+ print(r.model_extra["switchboard"]) # route, cost, savings telemetry
50
+ ```
51
+
52
+ **As a library:**
53
+
54
+ ```python
55
+ import asyncio
56
+ from switchboard import Engine
57
+
58
+ async def main():
59
+ eng = Engine()
60
+ rr = await eng.answer([{"role": "user", "content": "What is 17 * 23?"}], mode="cost")
61
+ print(rr.content, f"${rr.cost:.6f}", f"{rr.savings_pct:.0f}% cheaper than Opus")
62
+ await eng.aclose()
63
+
64
+ asyncio.run(main())
65
+ ```
66
+
67
+ **From the CLI:**
68
+
69
+ ```bash
70
+ switchboard ask "Prove sqrt(2) is irrational" --mode quality
71
+ switchboard models # probe which gateway models are actually live
72
+ ```
73
+
74
+ ---
75
+
76
+ ## The honest thesis (read this first)
77
+
78
+ The goal is a router that is **cheaper than a frontier model (e.g. Opus) and
79
+ matches-or-beats it on benchmarks**. That is achievable — but only as a
80
+ **portfolio result over a realistic workload**, not a per-query miracle. The iron
81
+ law:
82
+
83
+ > On a *single hard query*, you cannot both beat the frontier model **and** be
84
+ > cheaper than it on that same query.
85
+
86
+ What you *can* do, and what this does:
87
+
88
+ | Traffic | What the router does | Outcome |
89
+ |---|---|---|
90
+ | **Easy queries** (most real traffic) | route to a cheap model | quality ties Opus, **5–50× cheaper** |
91
+ | **Hard queries** (the minority) | **Mixture-of-Agents**: several cheap/mid models answer in parallel, a synthesizer fuses them | quality can **match or exceed** a single Opus call, still **< Opus cost** |
92
+ | **Repeats** | exact-match cache | **free** |
93
+
94
+ Averaged over the workload, total spend is well below always-Opus and mean
95
+ accuracy is **equal-or-better**. Grounded in **RouteLLM**, **FrugalGPT** (cascade
96
+ with a judge), and **Mixture-of-Agents**.
97
+
98
+ ---
99
+
100
+ ## Modes
101
+
102
+ Pick the strategy via the `model` field:
103
+
104
+ | `model` | strategy |
105
+ |---|---|
106
+ | `router` / `router-balanced` | triage → single cheap (easy) / single mid (moderate) / Mixture-of-Agents (hard) |
107
+ | `router-cost` | **FrugalGPT cascade** — answer cheap, a judge scores it, escalate only if low |
108
+ | `router-quality` | bias one tier up — best quality while staying under Opus cost |
109
+
110
+ Any **real** model id (`claude-opus-4-8`, `gpt-5.5`, …) passes straight through, so
111
+ this also works as a plain multi-provider proxy.
112
+
113
+ ## How it works
114
+
115
+ ```
116
+ request ─► [cache] ─► [triage: how hard?] ─► [policy] ──► single cheap model (easy)
117
+ └─► single mid model (moderate)
118
+ └─► Mixture-of-Agents (hard)
119
+ proposers ∥ ─► synthesizer
120
+ ```
121
+
122
+ - **Triage** (`src/switchboard/classify.py`) — free heuristics (length, code/math
123
+ markers, multi-step verbs) decide obvious cases; a tiny LLM classifier scores the
124
+ ambiguous middle. Output: difficulty 1–5 → tier.
125
+ - **Policy / execution** (`src/switchboard/engine.py`) — `single`, `moa` (parallel
126
+ proposers + synthesizer), or `cascade` (cheap → judge → escalate).
127
+ - **Cost accounting** — every response carries its internal cost, an estimate of
128
+ what always-Opus would have cost, and the savings %, under a `switchboard` key.
129
+
130
+ ---
131
+
132
+ ## Results
133
+
134
+ On **GSM8K (50 items, exact numeric grading)**, baseline = always `claude-opus-4-8`:
135
+
136
+ | config | accuracy | total cost | vs Opus |
137
+ |---|---|---|---|
138
+ | always-Opus | 100.0% | $0.3674 | baseline |
139
+ | `router-cost` | **100.0%** | $0.0064 | **57× cheaper — Pareto win** |
140
+ | `router-quality` | 100.0% | $0.2781 | 1.3× cheaper |
141
+ | `router-balanced` | 92.0% | $0.0611 | 6× cheaper but lost accuracy |
142
+
143
+ Reproduce: `python -m bench.run_gsm8k --n 50 --seed 0`. Full write-up and honest
144
+ caveats in [`RESULTS.md`](RESULTS.md). (The verifier is what makes routing safe —
145
+ `router-balanced` has none and lost 8 points; `router-cost`'s judge is the fix.)
146
+
147
+ ---
148
+
149
+ ## Limitations & next steps
150
+
151
+ - **Pricing is a list-price proxy** (`src/switchboard/config.py`). Drop your real
152
+ rate card into `pricing.json` (`{"model": [in_per_1M, out_per_1M]}`) to override.
153
+ - **Triage under-detects "deceptively simple" trap questions** — `router-cost`/
154
+ `router-quality` compensate via the judge/MoA.
155
+ - **Streaming is simulated** (full answer computed, then chunked) — MoA can't
156
+ token-stream; only the single-model path could truly stream.
157
+ - **Semantic cache** (embed prompt → nearest neighbour) is not yet wired.
158
+ - **The gateway's `/v1/models` list may be stale** — trust `switchboard models`.
159
+
160
+ ## License
161
+
162
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,83 @@
1
+ # Measured results
2
+
3
+ ## GSM8K — 50 items, seed 0 (recognized benchmark)
4
+
5
+ `.venv/bin/python -m bench.run_gsm8k --n 50 --seed 0`. Zero-shot chain-of-thought,
6
+ exact numeric grading (gold = number after `####`), no LLM judge. Baseline =
7
+ always `claude-opus-4-8`, same 50 items for every config.
8
+
9
+ ```
10
+ config acc total$ $/correct mean_ms
11
+ ------------------------------------------------------------------------------
12
+ claude-opus-4-8 100.0% 0.36744 0.007349 2094
13
+ router-balanced 92.0% 0.06106 0.001327 4025
14
+ router-cost 100.0% 0.00640 0.000128 2898
15
+ router-quality 100.0% 0.27812 0.005562 6784
16
+ ------------------------------------------------------------------------------
17
+ router-cost acc 100.0% (+0.0pt) cost $0.00640 (57.4x cheaper) PARETO-WIN
18
+ router-quality acc 100.0% (+0.0pt) cost $0.27812 ( 1.3x cheaper) PARETO-WIN
19
+ router-balanced acc 92.0% (-8.0pt) cost $0.06106 ( 6.0x cheaper) cheaper, LOWER acc
20
+ ```
21
+
22
+ **Findings (honest):**
23
+ - **`router-cost` (FrugalGPT cascade) is Pareto-dominant: ties Opus at 100% for
24
+ 57× less cost.** The cheap model + judge resolves most items; only low-confidence
25
+ ones escalate. This is the headline result on a recognized benchmark.
26
+ - **`router-balanced` LOST 8 points** (4/50 items). This is the real cost of naive
27
+ single-model routing with *no verification* — when the mid model errs, nothing
28
+ catches it. The cascade's judge is precisely the fix, and the data shows it.
29
+ Reported, not hidden. (Actionable next step: add a light verifier to the
30
+ balanced mid tier.)
31
+ - **Magnitude caveat:** GSM8K is now *easy* for modern small models, so 57× is
32
+ near the optimistic end. Expect smaller-but-real savings on harder suites
33
+ (MMLU-Pro, GPQA) where the cheap tier carries less of the load.
34
+
35
+ ---
36
+
37
+ ## Mixed smoke set — 15 items
38
+
39
+ Run of `.venv/bin/python -m bench.run_bench` on the 15-item mixed set
40
+ (easy factual/math, CRT "trap" questions, open-ended reasoning & code).
41
+ Baseline = always `claude-opus-4-8`. Grader = `gemini-3.1-pro-preview`
42
+ (independent of the MoA pool). Costs use the list-price proxies in
43
+ `src/switchboard/config.py`.
44
+
45
+ ```
46
+ config acc total$ $/correct mean_ms
47
+ ------------------------------------------------------------------------------
48
+ claude-opus-4-8 93.3% 0.10338 0.00738 3084
49
+ router-balanced 93.3% 0.01162 0.00083 3138
50
+ router-cost 100.0% 0.01106 0.00074 4534
51
+ router-quality 100.0% 0.04278 0.00285 5502
52
+ ------------------------------------------------------------------------------
53
+ router-balanced acc 93.3% (+0.0pt) cost $0.01162 ( 8.9x cheaper) PARETO-WIN
54
+ router-cost acc 100.0% (+6.7pt) cost $0.01106 ( 9.3x cheaper) PARETO-WIN
55
+ router-quality acc 100.0% (+6.7pt) cost $0.04278 ( 2.4x cheaper) PARETO-WIN
56
+ ```
57
+
58
+ ## Reading this honestly
59
+
60
+ - **All three router modes are Pareto wins**: equal-or-better accuracy than
61
+ always-Opus at 2.4×–9.3× lower cost. `router-cost` is strictly better on
62
+ *both* axes here.
63
+ - **`router-balanced` matches Opus accuracy at near-Opus latency** (3.1s vs 3.1s) because
64
+ most items route to a single cheap model. `router-quality` is slower (5.5s)
65
+ because it fires Mixture-of-Agents on more items — that is the latency you pay
66
+ for the parallel internal calls.
67
+ - **Caveats (don't over-read a small sample):**
68
+ - 15 items, single-vote judge → the 3 open-ended items carry grader noise.
69
+ Opus's (correct) √2 proof was marked wrong by the judge in this run; a
70
+ majority-vote judge would smooth this out.
71
+ - The 12 exact-match items are noise-free and every config answered all 12
72
+ correctly — the accuracy separation comes entirely from the judged items.
73
+ - Costs are **list-price proxies**, not your gateway's actual rates. Swap in the real
74
+ rate card via `pricing.json` to get true dollar savings.
75
+
76
+ ## How to reproduce / scale up
77
+
78
+ ```bash
79
+ .venv/bin/python -m bench.run_bench # this run
80
+ .venv/bin/python -m bench.run_bench --n 8 # quick smoke
81
+ # add your own items to bench/dataset.py to test on YOUR workload mix —
82
+ # the savings depend heavily on how much of your traffic is genuinely easy.
83
+ ```
@@ -0,0 +1,67 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "switchboard-llm"
7
+ description = "An OpenAI-compatible LLM router that saves cost without losing quality."
8
+ readme = "README.md"
9
+ license = "MIT"
10
+ requires-python = ">=3.11"
11
+ authors = [{ name = "Archit Dwivedi" }]
12
+ keywords = ["llm", "router", "openai", "anthropic", "gemini", "mixture-of-agents", "frugalgpt", "cost", "gateway"]
13
+ classifiers = [
14
+ "Development Status :: 4 - Beta",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Programming Language :: Python :: 3.13",
20
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
21
+ ]
22
+ dynamic = ["version"]
23
+ dependencies = [
24
+ "httpx>=0.27",
25
+ "fastapi>=0.110",
26
+ "uvicorn[standard]>=0.29",
27
+ ]
28
+
29
+ [project.scripts]
30
+ switchboard = "switchboard.cli:main"
31
+
32
+ [project.urls]
33
+ Homepage = "https://github.com/archit0/switchboard"
34
+ Repository = "https://github.com/archit0/switchboard"
35
+ Issues = "https://github.com/archit0/switchboard/issues"
36
+
37
+ [dependency-groups]
38
+ dev = [
39
+ "pytest>=8",
40
+ "ruff>=0.6",
41
+ ]
42
+
43
+ [tool.hatch.version]
44
+ path = "src/switchboard/__init__.py"
45
+
46
+ [tool.hatch.build.targets.wheel]
47
+ packages = ["src/switchboard"]
48
+
49
+ [tool.hatch.build.targets.sdist]
50
+ include = ["src/switchboard", "README.md", "RESULTS.md", "LICENSE"]
51
+
52
+ [tool.pytest.ini_options]
53
+ testpaths = ["tests"]
54
+
55
+ [tool.ruff]
56
+ line-length = 120
57
+ src = ["src", "tests", "bench", "tools"]
58
+
59
+ [tool.ruff.lint]
60
+ select = ["E", "F", "I", "UP", "B", "W", "C4"]
61
+ ignore = [
62
+ "E501", # long lines (lots of explanatory comments / prompts)
63
+ "B008", # function calls in argument defaults (FastAPI-style)
64
+ ]
65
+
66
+ [tool.ruff.lint.per-file-ignores]
67
+ "tests/**" = ["S101"]
@@ -0,0 +1,57 @@
1
+ """switchboard — an OpenAI-compatible LLM router that saves cost without losing quality.
2
+
3
+ Point any OpenAI client at the switchboard server and it routes each request to
4
+ the cheapest model that can handle it — easy prompts to a small model, hard ones
5
+ to a parallel Mixture-of-Agents — trading a little latency for large savings while
6
+ holding (or beating) frontier-model quality on a representative workload.
7
+
8
+ Quickstart (library)::
9
+
10
+ import asyncio
11
+ from switchboard import Engine
12
+
13
+ async def main():
14
+ eng = Engine()
15
+ result = await eng.answer(
16
+ [{"role": "user", "content": "What is 17 * 23?"}],
17
+ mode="cost",
18
+ )
19
+ print(result.content, result.cost, result.savings_pct)
20
+ await eng.aclose()
21
+
22
+ asyncio.run(main())
23
+
24
+ Quickstart (OpenAI-compatible server)::
25
+
26
+ $ switchboard serve # http://localhost:8000/v1
27
+
28
+ from openai import OpenAI
29
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
30
+ client.chat.completions.create(model="router-cost", messages=[...])
31
+
32
+ The gateway is configured via the ``OPENAI_BASE_URL`` and ``OPENAI_API_KEY``
33
+ environment variables (any OpenAI-compatible endpoint that fronts multiple
34
+ providers — e.g. a LiteLLM proxy — works).
35
+ """
36
+
37
+ from switchboard.cache import ResponseCache
38
+ from switchboard.classify import Triage, triage
39
+ from switchboard.config import GatewayConfig, cost_usd, price_of
40
+ from switchboard.engine import Engine, RouteResult
41
+ from switchboard.gateway import Completion, Gateway
42
+
43
+ __version__ = "0.1.0"
44
+
45
+ __all__ = [
46
+ "Completion",
47
+ "Engine",
48
+ "Gateway",
49
+ "GatewayConfig",
50
+ "ResponseCache",
51
+ "RouteResult",
52
+ "Triage",
53
+ "__version__",
54
+ "cost_usd",
55
+ "price_of",
56
+ "triage",
57
+ ]
@@ -0,0 +1,56 @@
1
+ """Exact-match response cache.
2
+
3
+ The cheapest API call is the one you never make. Repeated/identical prompts
4
+ (very common in agent loops and eval harnesses) return instantly at zero cost.
5
+ A semantic cache (embed the prompt, nearest-neighbour over past prompts) is the
6
+ natural next step — the gateway exposes `gemini-embedding-*` and
7
+ `text-embedding-3-*` for exactly this — but exact-match already captures the
8
+ biggest, safest wins without false hits.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import hashlib
14
+ import json
15
+ import threading
16
+ import time
17
+ from typing import Any
18
+
19
+
20
def _key(messages: list[dict], mode: str) -> str:
    """Return a stable SHA-256 hex digest identifying ``(messages, mode)``.

    The payload is serialised as canonical JSON (sorted keys, non-ASCII kept
    verbatim) so that dicts differing only in key order map to the same key.

    Args:
        messages: Chat messages; must be JSON-serialisable.
        mode: Routing mode; part of the key so modes never share entries.

    Returns:
        A 64-character lowercase hex string.

    Raises:
        TypeError: If ``messages`` contains a value ``json.dumps`` cannot encode.
    """
    blob = json.dumps({"m": messages, "mode": mode}, sort_keys=True, ensure_ascii=False)
    # ``json.loads`` accepts lone surrogate escapes (e.g. "\ud800"), which a
    # strict UTF-8 encode rejects. "surrogatepass" hashes them instead of
    # raising, and yields identical bytes for every well-formed string.
    return hashlib.sha256(blob.encode("utf-8", errors="surrogatepass")).hexdigest()


class ResponseCache:
    """Bounded, optionally expiring, exact-match cache keyed on (messages, mode).

    All access to the store and the hit/miss counters happens under one lock.
    When the cache is full, the entry that was written longest ago is dropped
    (reads do not refresh an entry's age).

    Attributes:
        hits: Number of ``get`` calls that returned a cached value.
        misses: Number of ``get`` calls that found nothing (or an expired entry).
    """

    def __init__(self, max_items: int = 4096, ttl_seconds: float | None = None):
        """Create an empty cache.

        Args:
            max_items: Maximum number of entries kept. A value ``<= 0`` disables
                storage entirely (``put`` becomes a no-op).
            ttl_seconds: Entry lifetime in seconds, or ``None`` for no expiry.
        """
        # key -> (monotonic write time, value). Every write goes through
        # ``put``, which re-inserts the key, so dict order is write order.
        self._store: dict[str, tuple[float, Any]] = {}
        self._lock = threading.Lock()
        self._max = max_items
        self._ttl = ttl_seconds
        self.hits = 0
        self.misses = 0

    def get(self, messages: list[dict], mode: str) -> Any | None:
        """Return the cached value for ``(messages, mode)``, or ``None`` on a miss.

        An entry older than ``ttl_seconds`` is deleted and counted as a miss.
        """
        k = _key(messages, mode)
        with self._lock:
            item = self._store.get(k)
            if item is None:
                self.misses += 1
                return None
            written_at, val = item
            # Monotonic clock: immune to wall-clock jumps (NTP, manual changes).
            if self._ttl is not None and (time.monotonic() - written_at) > self._ttl:
                del self._store[k]
                self.misses += 1
                return None
            self.hits += 1
            return val

    def put(self, messages: list[dict], mode: str, value: Any) -> None:
        """Store ``value`` for ``(messages, mode)``, evicting the oldest entry if full.

        Overwriting an existing key refreshes its age and never evicts another
        entry. With ``max_items <= 0`` nothing is stored.
        """
        if self._max <= 0:
            # Previously this path crashed in min() on an empty store.
            return
        k = _key(messages, mode)
        with self._lock:
            # Remove first so the re-insert lands at the end of the dict,
            # keeping iteration order equal to write order.
            self._store.pop(k, None)
            if len(self._store) >= self._max:
                # First key in write order is the oldest: O(1) eviction.
                del self._store[next(iter(self._store))]
            self._store[k] = (time.monotonic(), value)