switchboard-llm 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ """switchboard — an OpenAI-compatible LLM router that saves cost without losing quality.
2
+
3
+ Point any OpenAI client at the switchboard server and it routes each request to
4
+ the cheapest model that can handle it — easy prompts to a small model, hard ones
5
+ to a parallel Mixture-of-Agents — trading a little latency for large savings while
6
+ holding (or beating) frontier-model quality on a representative workload.
7
+
8
+ Quickstart (library)::
9
+
10
+ import asyncio
11
+ from switchboard import Engine
12
+
13
+ async def main():
14
+ eng = Engine()
15
+ result = await eng.answer(
16
+ [{"role": "user", "content": "What is 17 * 23?"}],
17
+ mode="cost",
18
+ )
19
+ print(result.content, result.cost, result.savings_pct)
20
+ await eng.aclose()
21
+
22
+ asyncio.run(main())
23
+
24
+ Quickstart (OpenAI-compatible server)::
25
+
26
+ $ switchboard serve # http://localhost:8000/v1
27
+
28
+ from openai import OpenAI
29
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
30
+ client.chat.completions.create(model="router-cost", messages=[...])
31
+
32
+ The gateway is configured via the ``OPENAI_BASE_URL`` and ``OPENAI_API_KEY``
33
+ environment variables (any OpenAI-compatible endpoint that fronts multiple
34
+ providers — e.g. a LiteLLM proxy — works).
35
+ """
36
+
37
+ from switchboard.cache import ResponseCache
38
+ from switchboard.classify import Triage, triage
39
+ from switchboard.config import GatewayConfig, cost_usd, price_of
40
+ from switchboard.engine import Engine, RouteResult
41
+ from switchboard.gateway import Completion, Gateway
42
+
43
+ __version__ = "0.1.0"
44
+
45
+ __all__ = [
46
+ "Completion",
47
+ "Engine",
48
+ "Gateway",
49
+ "GatewayConfig",
50
+ "ResponseCache",
51
+ "RouteResult",
52
+ "Triage",
53
+ "__version__",
54
+ "cost_usd",
55
+ "price_of",
56
+ "triage",
57
+ ]
switchboard/cache.py ADDED
@@ -0,0 +1,56 @@
1
+ """Exact-match response cache.
2
+
3
+ The cheapest API call is the one you never make. Repeated/identical prompts
4
+ (very common in agent loops and eval harnesses) return instantly at zero cost.
5
+ A semantic cache (embed the prompt, nearest-neighbour over past prompts) is the
6
+ natural next step — the gateway exposes `gemini-embedding-*` and
7
+ `text-embedding-3-*` for exactly this — but exact-match already captures the
8
+ biggest, safest wins without false hits.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import hashlib
14
+ import json
15
+ import threading
16
+ import time
17
+ from typing import Any
18
+
19
+
20
+ def _key(messages: list[dict], mode: str) -> str:
21
+ blob = json.dumps({"m": messages, "mode": mode}, sort_keys=True, ensure_ascii=False)
22
+ return hashlib.sha256(blob.encode("utf-8")).hexdigest()
23
+
24
+
25
+ class ResponseCache:
26
+ def __init__(self, max_items: int = 4096, ttl_seconds: float | None = None):
27
+ self._store: dict[str, tuple[float, Any]] = {}
28
+ self._lock = threading.Lock()
29
+ self._max = max_items
30
+ self._ttl = ttl_seconds
31
+ self.hits = 0
32
+ self.misses = 0
33
+
34
+ def get(self, messages: list[dict], mode: str) -> Any | None:
35
+ k = _key(messages, mode)
36
+ with self._lock:
37
+ item = self._store.get(k)
38
+ if item is None:
39
+ self.misses += 1
40
+ return None
41
+ ts, val = item
42
+ if self._ttl is not None and (time.time() - ts) > self._ttl:
43
+ del self._store[k]
44
+ self.misses += 1
45
+ return None
46
+ self.hits += 1
47
+ return val
48
+
49
+ def put(self, messages: list[dict], mode: str, value: Any) -> None:
50
+ k = _key(messages, mode)
51
+ with self._lock:
52
+ if len(self._store) >= self._max and k not in self._store:
53
+ # drop oldest
54
+ oldest = min(self._store, key=lambda x: self._store[x][0])
55
+ del self._store[oldest]
56
+ self._store[k] = (time.time(), value)
@@ -0,0 +1,142 @@
1
+ """Triage: decide how hard a request is, as cheaply as possible.
2
+
3
+ Two signals, combined:
4
+ 1. Heuristics (free, instant): length, code/math markers, multi-step verbs.
5
+ 2. A tiny LLM classifier (gemini flash-lite, ~$0.0001/call) that reads a
6
+ truncated prefix of the prompt and returns difficulty 1-5 + domain.
7
+
8
+ The LLM call is skipped when heuristics are confident (very short trivial
9
+ prompts, or obvious giant code/math prompts), so most requests pay nothing for
10
+ triage. The classifier only earns its keep on the ambiguous middle.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import re
17
+ from dataclasses import dataclass
18
+
19
+ from . import config
20
+ from .gateway import Gateway
21
+
22
+ _CODE_RE = re.compile(r"```|def |class |import |function |SELECT |#include|=>|console\.|public static")
23
+ _MATH_RE = re.compile(r"\b(prove|integral|derivative|theorem|equation|matrix|probability|\d+\s*[+\-*/^]\s*\d+)\b", re.I)
24
+ _HARD_VERBS = re.compile(
25
+ r"\b(design|architect|optimi[sz]e|prove|derive|refactor|debug|analy[sz]e|"
26
+ r"compare|trade-?off|explain why|step by step|plan|strategy|edge cases?)\b",
27
+ re.I,
28
+ )
29
+
30
+ DOMAINS = ("code", "math", "reasoning", "factual", "creative", "chat", "other")
31
+
32
+
33
@dataclass
class Triage:
    """Outcome of triaging one request: how hard it is and which tier should serve it."""

    # 1.0 (trivial) .. 5.0 (very hard)
    difficulty: float
    # Domain guess for the request (e.g. "code", "math", "chat").
    domain: str
    # "cheap" | "mid" | "hard"
    tier: str
    # "heuristic" | "llm"
    source: str
    # Cost and latency of the classifier call; both stay 0.0 when no call was made.
    classifier_cost: float = 0.0
    classifier_ms: float = 0.0
    # Free-form remark about how the decision was reached.
    note: str = ""
42
+
43
+
44
+ def _last_user_text(messages: list[dict]) -> str:
45
+ for m in reversed(messages):
46
+ if m.get("role") == "user":
47
+ c = m.get("content")
48
+ if isinstance(c, str):
49
+ return c
50
+ if isinstance(c, list): # OpenAI content-parts form
51
+ return " ".join(p.get("text", "") for p in c if isinstance(p, dict))
52
+ return ""
53
+
54
+
55
+ def _tier_for(difficulty: float) -> str:
56
+ if difficulty <= 2.0:
57
+ return "cheap"
58
+ if difficulty <= 3.4:
59
+ return "mid"
60
+ return "hard"
61
+
62
+
63
def _heuristic(text: str) -> tuple[float, str, bool]:
    """Score a prompt with free, instant signals.

    Returns:
        ``(difficulty_prior, domain_guess, confident)``. When ``confident`` is
        true the prior is 1.5 (trivial) or 4.5 (hard); otherwise the prior is
        a value in [1.0, 5.0] built from length and marker counts.
    """
    length = len(text)
    has_code = _CODE_RE.search(text) is not None
    has_math = _MATH_RE.search(text) is not None
    hard_hits = len(_HARD_VERBS.findall(text))

    # Domain precedence: code beats math beats "reasoning"; default is chat.
    if has_code:
        domain = "code"
    elif has_math:
        domain = "math"
    elif hard_hits:
        domain = "reasoning"
    else:
        domain = "chat"

    # Very short and no complexity markers -> confidently trivial.
    any_marker = has_code or has_math or hard_hits > 0
    if length < 80 and not any_marker:
        return 1.5, domain, True

    # Long prompt or many hard verbs, backed by at least one more marker -> confidently hard.
    if (length > 2500 or hard_hits >= 3) and (has_code or has_math or hard_hits >= 2):
        return 4.5, domain, True

    # Ambiguous middle: build a prior from length and markers, clamped to [1, 5].
    prior = (
        2.0
        + min(length / 1500.0, 1.5)
        + 0.6 * min(hard_hits, 3)
        + (0.5 if has_code else 0.0)
        + (0.5 if has_math else 0.0)
    )
    return max(1.0, min(prior, 5.0)), domain, False
92
+
93
+
94
+ _CLASSIFIER_SYS = (
95
+ "You are a fast request-difficulty classifier for an LLM router. "
96
+ "Read the user request and rate how powerful a model it needs. "
97
+ "Respond with ONLY a compact JSON object, no prose:\n"
98
+ '{"difficulty": <1-5 int>, "domain": "code|math|reasoning|factual|creative|chat|other"}\n'
99
+ "Scale: 1=trivial (greeting, lookup), 2=easy (short factual, simple rewrite), "
100
+ "3=moderate (normal coding/explanation), 4=hard (multi-step reasoning, non-trivial "
101
+ "code, careful analysis), 5=very hard (research-grade proof, complex system design)."
102
+ )
103
+
104
+
105
async def triage(gw: Gateway, messages: list[dict], *, use_llm: bool = True) -> Triage:
    """Decide how hard a request is, calling the LLM classifier only when needed.

    Args:
        gw: Gateway used for the (optional) classifier call.
        messages: Chat messages; only the most recent user message is examined.
        use_llm: When false, the heuristic prior is always used as-is.

    Returns:
        A ``Triage``. ``source`` is ``"llm"`` only when the classifier's reply
        was actually parsed and used; if the call fails or the reply cannot be
        parsed the heuristic prior is returned with ``source="heuristic"`` and
        an explanatory ``note``. The classifier's cost and latency are reported
        whenever the call was made.
    """
    text = _last_user_text(messages)
    prior, domain, confident = _heuristic(text)

    if confident or not use_llm:
        note = "heuristic-confident" if confident else "llm-disabled"
        return Triage(prior, domain, _tier_for(prior), "heuristic", note=note)

    # The classifier only sees a prefix; the full length is passed as a hint.
    prefix = text[:1800]
    comp = await gw.complete(
        config.CLASSIFIER_MODEL,
        [
            {"role": "system", "content": _CLASSIFIER_SYS},
            {"role": "user", "content": f"Request prefix (len={len(text)} chars):\n{prefix}"},
        ],
        max_tokens=40,
    )

    difficulty, dom, source, note = prior, domain, "heuristic", "llm-failed"
    if comp.ok:
        try:
            m = re.search(r"\{.*\}", comp.content, re.S)
            obj = json.loads(m.group(0)) if m else {}
            d = float(obj.get("difficulty", prior))
            if d != d:  # NaN is valid for json.loads but meaningless here
                raise ValueError("difficulty is NaN")
        except (AttributeError, TypeError, ValueError):
            # JSONDecodeError is a ValueError; fall back to the heuristic prior.
            note = "llm-unparseable"
        else:
            # The prompt asks for 1-5; clamp so an out-of-range reply cannot
            # drag the blended score outside the scale.
            d = max(1.0, min(d, 5.0))
            difficulty = 0.5 * d + 0.5 * prior  # blend model judgement with prior
            dom = obj.get("domain", domain) if obj.get("domain") in DOMAINS else domain
            source, note = "llm", ""

    return Triage(
        difficulty=difficulty,
        domain=dom,
        tier=_tier_for(difficulty),
        source=source,
        classifier_cost=comp.cost,
        classifier_ms=comp.latency_ms,
        note=note,
    )
switchboard/cli.py ADDED
@@ -0,0 +1,106 @@
1
+ """Command-line interface for switchboard.
2
+
3
+ switchboard serve [--host H] [--port P] run the OpenAI-compatible server
4
+ switchboard ask "<prompt>" [--mode cost] route one prompt and print telemetry
5
+ switchboard models probe which gateway models are live
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import asyncio
12
+ import sys
13
+
14
+
15
def _cmd_serve(args: argparse.Namespace) -> int:
    """Handle ``switchboard serve``: run the server on ``args.host``/``args.port``; return 0."""
    # Imported inside the function, so the server module is only loaded when
    # this subcommand actually runs.
    from switchboard import server

    server.run(host=args.host, port=args.port)
    return 0
20
+
21
+
22
def _cmd_ask(args: argparse.Namespace) -> int:
    """Handle ``switchboard ask``: route one prompt, print the answer and telemetry.

    The answer goes to stdout; the one-line telemetry summary goes to stderr.
    """
    from switchboard.engine import Engine

    async def _route_once() -> int:
        engine = Engine()
        try:
            result = await engine.answer(
                [{"role": "user", "content": args.prompt}],
                mode=args.mode,
                max_tokens=args.max_tokens,
            )
        finally:
            # Close the engine whether or not routing succeeded.
            await engine.aclose()

        print(result.content)
        fields = [
            f"\n[route={result.route}",
            f"cost=${result.cost:.6f}",
            f"baseline(opus)≈${result.baseline_cost_est:.6f}",
            f"savings={result.savings_pct:.1f}%",
            f"{result.latency_ms:.0f}ms]",
        ]
        print(" | ".join(fields), file=sys.stderr)
        return 0

    return asyncio.run(_route_once())
45
+
46
+
47
def _cmd_models(args: argparse.Namespace) -> int:
    """Handle ``switchboard models``: probe every configured model once.

    Sends a tiny prompt to each distinct model id referenced in the config,
    prints one OK/FAIL line per model plus a summary, and returns 1 if any
    model failed (0 otherwise).
    """
    from switchboard import config
    from switchboard.gateway import Gateway

    async def go() -> int:
        # De-duplicate: several config entries point at the same model id.
        pool = sorted(
            set(
                config.CHEAP
                + config.MID
                + config.STRONG
                + [config.CLASSIFIER_MODEL, config.JUDGE_MODEL, config.DEFAULT_CHEAP, config.DEFAULT_MID]
                + config.MOA_PROPOSERS
                + [config.MOA_SYNTHESIZER]
                + config.BASELINE_MODELS
            )
        )
        gw = Gateway()

        async def probe(m: str):
            c = await gw.complete(m, [{"role": "user", "content": "Reply with one word: ok"}], max_tokens=300)
            return m, c.ok, "" if c.ok else (c.error or "")[:90]

        try:
            results = await asyncio.gather(*(probe(m) for m in pool))
        finally:
            # Release the HTTP client even if a probe raises or is cancelled.
            await gw.aclose()

        bad = [r for r in results if not r[1]]
        for m, ok, err in results:
            print(f" {'OK ' if ok else 'FAIL'} {m:<26}{'' if ok else ' <- ' + err}")
        print(
            f"\n{len(results) - len(bad)}/{len(results)} live."
            + (f" BROKEN: {[b[0] for b in bad]}" if bad else " all good.")
        )
        return 1 if bad else 0

    return asyncio.run(go())
81
+
82
+
83
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and dispatch to the chosen subcommand.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The exit code produced by the subcommand handler.
    """
    parser = argparse.ArgumentParser(prog="switchboard", description="OpenAI-compatible LLM router.")
    subparsers = parser.add_subparsers(dest="command", required=True)

    # serve
    serve_parser = subparsers.add_parser("serve", help="run the OpenAI-compatible server")
    serve_parser.set_defaults(func=_cmd_serve)
    serve_parser.add_argument("--host", default="0.0.0.0")
    serve_parser.add_argument("--port", type=int, default=8000)

    # ask
    ask_parser = subparsers.add_parser("ask", help="route one prompt and print the answer + telemetry")
    ask_parser.set_defaults(func=_cmd_ask)
    ask_parser.add_argument("prompt")
    ask_parser.add_argument("--mode", default="balanced", choices=["balanced", "cost", "quality"])
    ask_parser.add_argument("--max-tokens", type=int, default=1024, dest="max_tokens")

    # models
    models_parser = subparsers.add_parser("models", help="probe which gateway models are actually live")
    models_parser.set_defaults(func=_cmd_models)

    namespace = parser.parse_args(argv)
    handler = namespace.func
    return handler(namespace)


if __name__ == "__main__":
    sys.exit(main())
switchboard/config.py ADDED
@@ -0,0 +1,112 @@
1
+ """Model pool, pricing and routing tiers.
2
+
3
+ IMPORTANT: the prices below are *public list-price proxies* in USD per 1M
4
+ tokens. They are roughly correct relative to each other (which is what makes
5
+ the routing decisions sensible), but they are almost certainly NOT what your
6
+ gateway actually bills you. Drop your real rate card into `pricing.json`
7
+ in the directory that contains the `switchboard` package (the repo root in a source checkout) and it will override these at load time.
8
+
9
+ The whole point of the router is the *ratios* between tiers: a cheap model is
10
+ ~10-50x cheaper than Opus, a mid model ~3-10x cheaper. As long as those ratios
11
+ hold, the cost story holds.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import json
17
+ import os
18
+ from dataclasses import dataclass
19
+
20
+ # --------------------------------------------------------------------------- #
21
+ # Pricing: USD per 1,000,000 tokens (input, output).
22
+ # --------------------------------------------------------------------------- #
23
+ _DEFAULT_PRICING: dict[str, tuple[float, float]] = {
24
+ # cheap tier
25
+ "gemini-3.1-flash-lite": (0.10, 0.40),
26
+ "gpt-5-nano": (0.05, 0.40),
27
+ "claude-haiku-4-5": (1.00, 5.00),
28
+ # mid tier
29
+ "gemini-3.5-flash": (0.30, 2.50),
30
+ "gpt-5.4-mini": (0.25, 2.00),
31
+ "claude-sonnet-4-6": (3.00, 15.00),
32
+ # strong tier (escalation targets + baselines)
33
+ "claude-opus-4-8": (15.00, 75.00),
34
+ "claude-fable-5": (15.00, 75.00), # placeholder — real price unknown
35
+ "gpt-5.5": (1.25, 10.00),
36
+ "gemini-3.1-pro-preview": (1.25, 10.00),
37
+ }
38
+
39
+
40
def _load_pricing(path: str | None = None) -> dict[str, tuple[float, float]]:
    """Return the default rate card, overlaid with ``pricing.json`` if present.

    Args:
        path: Override file to read. Defaults to ``pricing.json`` in the
            parent directory of this package.

    Returns:
        A new dict of model id -> (input, output) USD per 1M tokens.

    The override is applied all-or-nothing: if the file is unreadable or any
    entry is malformed, a warning is written to stderr and the defaults are
    returned unchanged. A missing file is silently ignored.
    """
    pricing = dict(_DEFAULT_PRICING)
    if path is None:
        path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "pricing.json")
    try:
        with open(path, encoding="utf-8") as f:
            override = json.load(f)
        # Parse everything before touching `pricing` so a bad entry cannot
        # leave the table half-overridden.
        parsed = {k: (float(v[0]), float(v[1])) for k, v in override.items()}
    except FileNotFoundError:
        return pricing
    except Exception as e:  # noqa: BLE001 - best effort, never crash on bad file
        import sys

        # stderr, so the warning never mixes into output the CLI writes to stdout.
        print(f"[config] warning: could not load pricing.json: {e}", file=sys.stderr)
        return pricing
    pricing.update(parsed)
    return pricing


# Effective rate card, resolved once at import time.
PRICING = _load_pricing()
55
+
56
+
57
def price_of(model: str) -> tuple[float, float]:
    """Return (input, output) USD per 1M tokens for ``model``.

    Models missing from the rate card fall back to a mid-tier guess of
    (1.0, 5.0).
    """
    try:
        return PRICING[model]
    except KeyError:
        return (1.0, 5.0)
60
+
61
+
62
def cost_usd(model: str, prompt_tokens: int, completion_tokens: int) -> float:
    """Return the USD cost of one call, from token counts and the model's per-1M rates."""
    input_rate, output_rate = price_of(model)
    input_cost = (prompt_tokens / 1e6) * input_rate
    output_cost = (completion_tokens / 1e6) * output_rate
    return input_cost + output_cost
65
+
66
+
67
+ # --------------------------------------------------------------------------- #
68
+ # Tiers and pools. Edit these to change routing behaviour.
69
+ # --------------------------------------------------------------------------- #
70
+ # NOTE: the gateway's /v1/models list is stale — it advertises some ids that
71
+ # 404 at call time (e.g. gemini-3-pro-preview, claude-fable-5 -> "use Opus 4.8").
72
+ # Every model below has been verified to actually answer. Run
73
+ # `switchboard models` to re-check after any edit.
74
# Model pools per tier.
CHEAP = ["gemini-3.1-flash-lite", "gpt-5-nano", "claude-haiku-4-5"]
MID = ["gemini-3.5-flash", "gpt-5.4-mini", "claude-sonnet-4-6"]
STRONG = ["claude-opus-4-8", "gpt-5.5", "gemini-3.1-pro-preview"]

# Triage and judging both use one reliable, cheap, low-latency model. Gemini
# flash-lite was the most reliable for short structured outputs in testing
# (gpt-5-nano spent its whole budget on hidden reasoning and returned empty).
CLASSIFIER_MODEL = "gemini-3.1-flash-lite"
JUDGE_MODEL = "gemini-3.1-flash-lite"

# The single model chosen when a tier is answered by one model.
DEFAULT_CHEAP = "gemini-3.1-flash-lite"
DEFAULT_MID = "gemini-3.5-flash"

# Mixture-of-Agents (hard tier). Proposers come from different *providers*,
# which gives diversity without needing temperature (gpt-5 reasoning models
# reject non-default temperature anyway).
MOA_PROPOSERS = ["gemini-3.5-flash", "gpt-5.4-mini", "claude-haiku-4-5"]
MOA_SYNTHESIZER = "claude-sonnet-4-6"

# Baselines the router is benchmarked against ("always use the big model").
# claude-fable-5 is advertised by the gateway but 404s ("use Opus 4.8"), so the
# practical frontier baseline here is Opus 4.8.
PRIMARY_BASELINE = "claude-opus-4-8"
BASELINE_MODELS = ["claude-opus-4-8"]
99
+
100
+
101
@dataclass
class GatewayConfig:
    """Connection settings for the upstream OpenAI-compatible gateway."""

    base_url: str
    api_key: str

    @classmethod
    def from_env(cls) -> "GatewayConfig":
        """Build the config from ``OPENAI_BASE_URL`` and ``OPENAI_API_KEY``.

        Trailing slashes are stripped from the base URL.

        Raises:
            RuntimeError: If either variable is unset or empty.
        """
        env = os.environ
        base_url = env.get("OPENAI_BASE_URL", "").rstrip("/")
        api_key = env.get("OPENAI_API_KEY", "")
        if base_url and api_key:
            return cls(base_url=base_url, api_key=api_key)
        raise RuntimeError("OPENAI_BASE_URL and OPENAI_API_KEY must be set in the environment.")
switchboard/engine.py ADDED
@@ -0,0 +1,283 @@
1
+ """The router engine: triage -> policy -> execute -> telemetry.
2
+
3
+ Three strategies are composed by the policy:
4
+
5
+ single one model answers (trivial / moderate traffic — the common case).
6
+ moa Mixture-of-Agents: N diverse models propose in parallel, a
7
+ synthesizer fuses them. This is the lever that can *beat* a single
8
+ frontier model on hard queries, at well below frontier cost.
9
+ cascade FrugalGPT-style: answer cheap, a cheap judge scores it, escalate
10
+ only if the score is low. Minimises spend on the easy majority.
11
+
12
+ Every result reports its internal cost and an estimate of what always-Opus
13
+ would have cost, so savings are measured, not asserted.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import asyncio
19
+ import json
20
+ import re
21
+ from dataclasses import dataclass, field
22
+
23
+ from . import config
24
+ from .cache import ResponseCache
25
+ from .classify import Triage, triage
26
+ from .gateway import Completion, Gateway
27
+
28
+
29
@dataclass
class RouteResult:
    """Final answer for one routed request, plus cost and latency telemetry."""

    # Answer text returned to the caller.
    content: str
    # Human-readable description of the path taken (e.g. "single:<model>").
    route: str
    # Tier assigned by triage.
    tier: str
    # Every model id recorded as used while producing the answer.
    models_used: list[str]
    prompt_tokens: int
    completion_tokens: int
    # Total USD spent, including triage and judging.
    cost: float
    # Estimated USD cost had the primary baseline model answered instead.
    baseline_cost_est: float
    savings_pct: float
    latency_ms: float
    # True when the result was served from the response cache.
    cached: bool
    triage: "Triage"
    # Per-stage telemetry records; a fresh list per instance.
    steps: list[dict] = field(default_factory=list)
44
+
45
+
46
+ _JUDGE_SYS = (
47
+ "You are a strict answer-quality judge. Given a user request and a candidate "
48
+ "answer, rate how fully and correctly the answer satisfies the request. "
49
+ 'Respond ONLY with JSON: {"score": <0.0-1.0>, "reason": "<short>"}. '
50
+ "0.0 = wrong/empty/evasive, 1.0 = complete and correct."
51
+ )
52
+
53
+ _SYNTH_SYS = (
54
+ "You are an expert synthesizer in a Mixture-of-Agents system. You are given a "
55
+ "user request and several candidate answers from different models. The "
56
+ "candidates may be uneven or partly wrong. Critically compare them, discard "
57
+ "errors, and produce a single best answer that is more accurate and complete "
58
+ "than any individual candidate. Do not mention the candidates or the process; "
59
+ "just give the final answer to the user."
60
+ )
61
+
62
+
63
+ def _user_request_text(messages: list[dict]) -> str:
64
+ parts = []
65
+ for m in messages:
66
+ role = m.get("role")
67
+ c = m.get("content")
68
+ if isinstance(c, list):
69
+ c = " ".join(p.get("text", "") for p in c if isinstance(p, dict))
70
+ if role in ("user", "system") and c:
71
+ parts.append(f"{role}: {c}")
72
+ return "\n".join(parts)[:6000]
73
+
74
+
75
class Engine:
    """Route chat requests across the model pool: triage -> policy -> execute -> telemetry.

    Modes accepted by :meth:`answer`:

    * ``"cost"``    - cascade: cheap model, then mid model, then
      Mixture-of-Agents, stopping at the first answer the judge accepts.
    * ``"quality"`` - like balanced, but the triaged tier is bumped up one level.
    * anything else - handled as ``"balanced"``: the triaged tier selects a
      single cheap/mid model or, for the hard tier, Mixture-of-Agents.
    """

    def __init__(self, gateway: Gateway | None = None, cache: ResponseCache | None = None):
        self.gw = gateway or Gateway()
        # `is not None` rather than truthiness, so a caller-supplied cache is
        # always honoured.
        self.cache = cache if cache is not None else ResponseCache()

    async def aclose(self) -> None:
        """Close the underlying gateway client."""
        await self.gw.aclose()

    # ---- primitives ------------------------------------------------------- #
    async def _single(self, model: str, messages: list[dict], max_tokens: int) -> Completion:
        """Ask one model to answer ``messages``."""
        return await self.gw.complete(model, messages, max_tokens=max_tokens)

    async def _judge(self, messages: list[dict], answer: str) -> tuple[float, Completion]:
        """Score ``answer`` against the request with the judge model.

        Returns:
            ``(score, judge_completion)`` with the score clamped to [0, 1].
            The neutral score 0.5 is used when the judge call fails or its
            reply cannot be parsed.
        """
        req = _user_request_text(messages)
        comp = await self.gw.complete(
            config.JUDGE_MODEL,
            [
                {"role": "system", "content": _JUDGE_SYS},
                {"role": "user", "content": f"REQUEST:\n{req}\n\nANSWER:\n{answer[:4000]}"},
            ],
            max_tokens=60,
        )
        score = 0.5
        if comp.ok:
            try:
                m = re.search(r"\{.*\}", comp.content, re.S)
                score = float(json.loads(m.group(0)).get("score", 0.5)) if m else 0.5
            except (AttributeError, TypeError, ValueError):
                # JSONDecodeError is a ValueError; keep the neutral score.
                score = 0.5
        return max(0.0, min(score, 1.0)), comp

    async def _moa(
        self, messages: list[dict], proposers: list[str], synthesizer: str, max_tokens: int
    ) -> tuple[Completion, list[Completion]]:
        """Run Mixture-of-Agents: parallel proposers, then one synthesizer.

        Returns:
            ``(final, proposals)``. ``final`` is the synthesizer's completion,
            or a single ``DEFAULT_MID`` completion when every proposer failed.
            ``proposals`` always lists every proposer completion, failed or not.
        """
        # Fan out proposers concurrently — this is the parallelism that keeps MoA
        # from being N times slower; wall-clock ≈ slowest proposer + synthesizer.
        proposals = await asyncio.gather(*(self.gw.complete(m, messages, max_tokens=max_tokens) for m in proposers))
        good = [p for p in proposals if p.ok]
        if not good:
            # everything failed; fall back to a single mid model
            fb = await self._single(config.DEFAULT_MID, messages, max_tokens)
            return fb, list(proposals)

        req = _user_request_text(messages)
        bundle = "\n\n".join(f"--- Candidate {i + 1} (model {p.model}) ---\n{p.content}" for i, p in enumerate(good))
        synth = await self.gw.complete(
            synthesizer,
            [
                {"role": "system", "content": _SYNTH_SYS},
                {"role": "user", "content": f"USER REQUEST:\n{req}\n\nCANDIDATE ANSWERS:\n{bundle}"},
            ],
            max_tokens=max_tokens,
        )
        return synth, list(proposals)

    @staticmethod
    def _request_prompt_tokens(final: Completion, steps: list[dict]) -> int:
        """Return the prompt-token count of the *original* request (0 if unknown).

        For Mixture-of-Agents routes ``final`` is the synthesizer, whose prompt
        also contains every candidate answer. Proposers received the original
        messages, so their recorded counts are used when available.
        """
        proposer_counts = [
            s.get("prompt_tokens") or 0
            for s in steps
            if str(s.get("stage", "")).endswith("moa-proposer")
        ]
        return max(proposer_counts, default=0) or final.prompt_tokens

    # ---- top-level -------------------------------------------------------- #
    async def answer(
        self,
        messages: list[dict],
        *,
        mode: str = "balanced",
        max_tokens: int = 1024,
        use_llm_triage: bool = True,
        use_cache: bool = True,
    ) -> RouteResult:
        """Answer ``messages`` via the cheapest route the policy allows.

        Args:
            messages: OpenAI-style chat messages.
            mode: ``"balanced"``, ``"cost"`` or ``"quality"`` (see class docstring).
            max_tokens: Completion budget passed to every answering model.
            use_llm_triage: Allow the LLM classifier during triage.
            use_cache: Read from and write to the response cache.

        Returns:
            A ``RouteResult`` with the answer, the route taken, the total cost
            and an estimate of what the primary baseline model would have cost.
        """
        loop = asyncio.get_running_loop()
        t_start = loop.time()

        # max_tokens changes the answer (truncation), so it has to be part of
        # the cache identity. The cache API only takes (messages, mode), so it
        # is folded into the mode component of the key.
        cache_mode = f"{mode}|max_tokens={max_tokens}"

        if use_cache:
            cached = self.cache.get(messages, cache_mode)
            if cached is not None:
                c: RouteResult = cached
                # Hand out fresh lists so a caller mutating a cache hit cannot
                # corrupt the stored entry.
                return RouteResult(
                    **{
                        **c.__dict__,
                        "cached": True,
                        "latency_ms": 0.0,
                        "models_used": list(c.models_used),
                        "steps": list(c.steps),
                    }
                )

        tri = await triage(self.gw, messages, use_llm=use_llm_triage)
        steps: list[dict] = [{"stage": "triage", **_triage_dict(tri)}]

        spent = tri.classifier_cost
        models_used: list[str] = []

        # ------------------------------------------------------------------ #
        # Policy
        # ------------------------------------------------------------------ #
        if mode == "cost":
            final, extra_cost, used, route = await self._cascade(messages, tri, max_tokens, steps)
            spent += extra_cost
        else:
            tier = tri.tier
            if mode == "quality":  # bias one tier up
                tier = {"cheap": "mid", "mid": "hard", "hard": "hard"}[tier]

            if tier in ("cheap", "mid"):
                model = config.DEFAULT_CHEAP if tier == "cheap" else config.DEFAULT_MID
                final = await self._single(model, messages, max_tokens)
                route, used = f"single:{model}", [model]
                spent += final.cost
                steps.append(_step("single", final))
            else:  # hard -> Mixture-of-Agents
                final, props = await self._moa(messages, config.MOA_PROPOSERS, config.MOA_SYNTHESIZER, max_tokens)
                used = [p.model for p in props] + [config.MOA_SYNTHESIZER]
                route = f"moa[{'+'.join(config.MOA_PROPOSERS)}]->{config.MOA_SYNTHESIZER}"
                spent += sum(p.cost for p in props) + final.cost
                steps.extend(_step("moa-proposer", p) for p in props)
                steps.append(_step("moa-synth", final))
        models_used += used

        # ------------------------------------------------------------------ #
        # Telemetry + baseline comparison
        # ------------------------------------------------------------------ #
        rep_pt = final.prompt_tokens or _est_tokens(messages)
        rep_ct = final.completion_tokens or _est_tokens_text(final.content)
        # The baseline model would have seen only the original request, so it
        # is priced on that, not on the (much larger) synthesizer prompt.
        baseline_pt = self._request_prompt_tokens(final, steps) or _est_tokens(messages)
        baseline = config.cost_usd(config.PRIMARY_BASELINE, baseline_pt, rep_ct)
        savings = (1 - spent / baseline) * 100 if baseline > 0 else 0.0

        result = RouteResult(
            content=final.content,
            route=route,
            tier=tri.tier,
            models_used=models_used,
            prompt_tokens=rep_pt,
            completion_tokens=rep_ct,
            cost=spent,
            baseline_cost_est=baseline,
            savings_pct=savings,
            latency_ms=(loop.time() - t_start) * 1000,
            cached=False,
            triage=tri,
            steps=steps,
        )
        # Only usable answers are cached; failures should be retried next time.
        if use_cache and final.ok:
            self.cache.put(messages, cache_mode, result)
        return result

    async def _cascade(
        self, messages: list[dict], tri: Triage, max_tokens: int, steps: list[dict]
    ) -> tuple[Completion, float, list[str], str]:
        """FrugalGPT cascade: cheap -> judge -> mid -> judge -> MoA.

        Telemetry records are appended to ``steps`` in place.

        Returns:
            ``(final_completion, cost_spent, models_used, route_description)``.
        """
        spent = 0.0
        used: list[str] = []

        # (stage label, model, minimum judge score to accept the answer)
        stages = (
            ("cascade-cheap", config.DEFAULT_CHEAP, 0.70),
            ("cascade-mid", config.DEFAULT_MID, 0.65),
        )
        for stage, model, threshold in stages:
            candidate = await self._single(model, messages, max_tokens)
            spent += candidate.cost
            used.append(model)
            steps.append(_step(stage, candidate))
            if not candidate.ok:
                # An empty or failed answer can never be accepted, so do not
                # pay for a judge call on it; escalate straight away.
                continue
            score, verdict = await self._judge(messages, candidate.content)
            spent += verdict.cost
            steps.append({"stage": "cascade-judge", "score": round(score, 3), "cost": verdict.cost})
            if score >= threshold:
                return candidate, spent, used, f"cascade:single:{model}(score={score:.2f})"

        synth, props = await self._moa(messages, config.MOA_PROPOSERS, config.MOA_SYNTHESIZER, max_tokens)
        spent += sum(p.cost for p in props) + synth.cost
        used += [p.model for p in props] + [config.MOA_SYNTHESIZER]
        steps.extend(_step("cascade-moa-proposer", p) for p in props)
        steps.append(_step("cascade-moa-synth", synth))
        return synth, spent, used, f"cascade:moa->{config.MOA_SYNTHESIZER}"
249
+
250
+
251
+ # --------------------------------------------------------------------------- #
252
+ # helpers
253
+ # --------------------------------------------------------------------------- #
254
def _est_tokens(messages: list[dict]) -> int:
    """Estimate prompt tokens as one per four transcript characters, never below 1."""
    char_count = len(_user_request_text(messages))
    estimate = char_count // 4
    return estimate if estimate >= 1 else 1
257
+
258
+
259
+ def _est_tokens_text(text: str) -> int:
260
+ return max(1, len(text) // 4)
261
+
262
+
263
+ def _triage_dict(t: Triage) -> dict:
264
+ return {
265
+ "difficulty": round(t.difficulty, 2),
266
+ "domain": t.domain,
267
+ "tier": t.tier,
268
+ "source": t.source,
269
+ "cost": round(t.classifier_cost, 8),
270
+ }
271
+
272
+
273
+ def _step(stage: str, c: Completion) -> dict:
274
+ return {
275
+ "stage": stage,
276
+ "model": c.model,
277
+ "ok": c.ok,
278
+ "prompt_tokens": c.prompt_tokens,
279
+ "completion_tokens": c.completion_tokens,
280
+ "cost": c.cost,
281
+ "latency_ms": round(c.latency_ms, 1),
282
+ "error": c.error,
283
+ }
switchboard/gateway.py ADDED
@@ -0,0 +1,124 @@
1
+ """Async client for the OpenAI-compatible gateway.
2
+
3
+ One client addresses every model — OpenAI, Anthropic and Google — by just
4
+ swapping the `model` field. This is what makes the router simple: it is just a
5
+ policy on top of a single `complete()` call.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import time
11
+ from dataclasses import dataclass, field
12
+
13
+ import httpx
14
+
15
+ from . import config
16
+
17
+
18
@dataclass
class Completion:
    """Result of one gateway call: the text plus usage, cost and timing."""

    model: str
    content: str
    prompt_tokens: int
    completion_tokens: int
    cost: float
    latency_ms: float
    finish_reason: str = ""
    # Error description when the call failed; None on success.
    error: str | None = None
    # Decoded response body; excluded from repr.
    raw: dict = field(default_factory=dict, repr=False)

    @property
    def ok(self) -> bool:
        """True when there is no error and the content is not blank."""
        if self.error is not None:
            return False
        return self.content.strip() != ""
33
+
34
+
35
+ # gpt-5* reasoning models bill hidden reasoning tokens against the completion
36
+ # budget and return empty content if the budget is too small. We give them
37
+ # headroom and retry once with a bigger budget if they come back empty.
38
+ def _is_reasoning_model(model: str) -> bool:
39
+ return model.startswith("gpt-5") or model.startswith("o3") or model.startswith("o4")
40
+
41
+
42
class Gateway:
    """Async client for the OpenAI-compatible gateway's chat-completions endpoint.

    Args:
        cfg: Connection settings; read from the environment when omitted.
        concurrency: Maximum number of simultaneous HTTP connections.
    """

    def __init__(self, cfg: config.GatewayConfig | None = None, *, concurrency: int = 24):
        self.cfg = cfg or config.GatewayConfig.from_env()
        self._client = httpx.AsyncClient(
            base_url=self.cfg.base_url,
            headers={
                "Authorization": f"Bearer {self.cfg.api_key}",
                "Content-Type": "application/json",
            },
            timeout=httpx.Timeout(120.0, connect=10.0),
            limits=httpx.Limits(max_connections=concurrency),
        )

    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def complete(
        self,
        model: str,
        messages: list[dict],
        *,
        max_tokens: int = 1024,
        temperature: float | None = None,
        retries: int = 2,
    ) -> Completion:
        """Request one chat completion; failures come back in ``Completion.error``.

        Args:
            model: Model id to address.
            messages: OpenAI-style chat messages.
            max_tokens: Completion budget. Reasoning models get extra headroom.
            temperature: Sampling temperature; not sent to reasoning models.
            retries: Extra attempts allowed after the first.

        Returns:
            A ``Completion``. ``cost`` includes attempts whose empty output was
            discarded, since those tokens were still consumed. On failure
            ``content`` is empty and ``error`` describes the last problem.
        """
        import asyncio  # local import: this module does not import asyncio at top level

        reasoning = _is_reasoning_model(model)
        # gpt-5 reasoning models need extra headroom; everyone gets at least the ask.
        budget = max_tokens
        if reasoning:
            budget = max(max_tokens, 1024) + 1024  # room for hidden reasoning

        last_err = "unknown error"
        billed = 0.0  # cost of earlier attempts whose output was thrown away
        dt = 0.0
        for attempt in range(retries + 1):
            payload: dict = {
                "model": model,
                "messages": messages,
                "max_completion_tokens": budget,
            }
            # gpt-5 reasoning models reject non-default temperature.
            if temperature is not None and not reasoning:
                payload["temperature"] = temperature

            data = None
            fatal = False
            t0 = time.perf_counter()
            try:
                r = await self._client.post("/chat/completions", json=payload)
                if r.status_code == 200:
                    data = r.json()
                    if not isinstance(data, dict):
                        raise ValueError("response body is not a JSON object")
                else:
                    last_err = f"HTTP {r.status_code}: {r.text[:200]}"
                    # 4xx other than rate limit won't fix on retry
                    fatal = r.status_code not in (408, 409, 425, 429, 500, 502, 503, 504)
            except Exception as e:  # noqa: BLE001
                data = None
                last_err = f"{type(e).__name__}: {e}"
            dt = (time.perf_counter() - t0) * 1000

            if data is None:
                if fatal:
                    return Completion(model, "", 0, 0, billed, dt, error=last_err)
                if attempt < retries:
                    # Back off before retrying so a rate-limited or overloaded
                    # gateway is not hit again immediately.
                    await asyncio.sleep(min(0.5 * 2**attempt, 4.0))
                continue

            choice = (data.get("choices") or [{}])[0] or {}
            content = (choice.get("message") or {}).get("content") or ""
            finish = choice.get("finish_reason") or ""
            usage = data.get("usage") or {}
            # `or 0` also covers fields that are present but null.
            pt = int(usage.get("prompt_tokens") or 0)
            ct = int(usage.get("completion_tokens") or 0)
            cost = billed + config.cost_usd(model, pt, ct)

            # Empty content because the reasoning model ran out of budget — retry bigger.
            if not content.strip() and finish == "length" and attempt < retries:
                budget *= 2
                billed = cost
                last_err = "empty content (finish_reason=length)"
                continue

            return Completion(
                model=model,
                content=content,
                prompt_tokens=pt,
                completion_tokens=ct,
                cost=cost,
                latency_ms=dt,
                finish_reason=finish,
                raw=data,
            )

        return Completion(model, "", 0, 0, billed, dt, error=last_err)
switchboard/py.typed ADDED
File without changes
switchboard/server.py ADDED
@@ -0,0 +1,190 @@
1
+ """OpenAI-compatible HTTP server in front of the router.
2
+
3
+ Point any OpenAI client at it and it Just Works::
4
+
5
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
6
+ client.chat.completions.create(model="router", messages=[...])
7
+
8
+ Virtual models:
9
+ router / router-balanced triage -> single (easy) or Mixture-of-Agents (hard)
10
+ router-cost FrugalGPT cascade (cheapest; escalate on low score)
11
+ router-quality bias one tier up (best quality under Opus cost)
12
+
13
+ Any *real* model id (e.g. "claude-opus-4-8", "gemini-3-pro-preview") is passed
14
+ straight through to the gateway, so this also works as a plain proxy.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import time
21
+ import uuid
22
+ from contextlib import asynccontextmanager
23
+
24
+ from fastapi import FastAPI, Request
25
+ from fastapi.responses import JSONResponse, StreamingResponse
26
+
27
+ from switchboard.engine import Engine
28
+
29
# Virtual model ids a client may put in the request's ``model`` field, mapped
# to the routing mode handed to ``Engine.answer(mode=...)``. Any id that is not
# a key here is treated by the chat-completions handler as a real model and
# proxied to the gateway.
MODE_BY_MODEL = {
    "router": "balanced",
    "router-balanced": "balanced",
    "router-cost": "cost",
    "router-quality": "quality",
}

# Key under which router telemetry is attached to each response (ignored by
# standard OpenAI clients).
META_KEY = "switchboard"
39
+
40
+
41
@asynccontextmanager
async def _lifespan(app: FastAPI):
    """Own the shared ``Engine`` for the lifetime of the application.

    An ``Engine`` is created and stored on ``app.state.engine`` before the
    ``yield``; whatever engine is on ``app.state`` afterwards is closed with
    ``aclose()``.
    """
    app.state.engine = Engine()
    try:
        yield
    finally:
        # Runs even if the serving phase ends with an exception.
        await app.state.engine.aclose()
48
+
49
+
50
def create_app() -> FastAPI:
    """Build the FastAPI application exposing the OpenAI-compatible routes.

    Routes registered:
        GET  /healthz              liveness probe, always ``{"ok": True}``
        GET  /v1/models            lists the virtual router model ids
        POST /v1/chat/completions  routes virtual models through the engine and
                                   proxies every other model id to the gateway

    Returns:
        The configured application; the shared ``Engine`` is attached to
        ``app.state.engine`` by the ``_lifespan`` context manager.
    """
    app = FastAPI(title="switchboard", version="0.1.0", lifespan=_lifespan)

    def bad_request(message: str) -> JSONResponse:
        # Same {"error": {"message": ...}} envelope as the 502 proxy-failure
        # path, so clients only have one error shape to parse.
        return JSONResponse(
            {"error": {"message": message, "type": "invalid_request_error"}},
            status_code=400,
        )

    @app.get("/healthz")
    async def healthz() -> dict:
        return {"ok": True}

    @app.get("/v1/models")
    async def models() -> JSONResponse:
        virtual = [{"id": m, "object": "model", "owned_by": "switchboard"} for m in MODE_BY_MODEL]
        return JSONResponse({"object": "list", "data": virtual})

    @app.post("/v1/chat/completions")
    async def chat_completions(request: Request):
        engine: Engine = request.app.state.engine

        # Client mistakes are answered with a 400 instead of escaping the
        # handler as an unhandled exception (which surfaces as a 500).
        try:
            body = await request.json()
        except ValueError:  # JSONDecodeError and UnicodeDecodeError are both ValueErrors
            return bad_request("request body is not valid JSON")
        if not isinstance(body, dict):
            return bad_request("request body must be a JSON object")

        model = body.get("model", "router")
        if not isinstance(model, str):
            return bad_request("'model' must be a string")
        messages = body.get("messages", [])
        stream = bool(body.get("stream", False))
        try:
            # Either spelling of the token limit is accepted; falsy values
            # (missing, null, 0) fall through to the 1024 default.
            max_tokens = int(body.get("max_completion_tokens") or body.get("max_tokens") or 1024)
        except (TypeError, ValueError):
            return bad_request("'max_tokens' / 'max_completion_tokens' must be an integer")

        # Real model id -> transparent proxy.
        if model not in MODE_BY_MODEL:
            comp = await engine.gw.complete(model, messages, max_tokens=max_tokens, temperature=body.get("temperature"))
            if not comp.ok and comp.error:
                return JSONResponse({"error": {"message": comp.error}}, status_code=502)
            shim = _ProxyResult(comp, model)
            if stream:
                return StreamingResponse(_sse_chunks(shim, model), media_type="text/event-stream")
            return JSONResponse(_openai_response(shim, model))

        # Virtual model -> let the engine pick the route for the mapped mode.
        rr = await engine.answer(messages, mode=MODE_BY_MODEL[model], max_tokens=max_tokens)
        if stream:
            return StreamingResponse(_sse_chunks(rr, model), media_type="text/event-stream")
        return JSONResponse(_openai_response(rr, model))

    return app
87
+
88
+
89
class _ProxyResult:
    """Present a raw gateway completion through the attribute set the response
    formatters read from a routing result."""

    def __init__(self, comp, model: str):
        # Routing metadata: a pass-through involves exactly one model and no
        # triage tier, cache lookup or intermediate steps.
        self.route = f"proxy:{model}"
        self.tier = "n/a"
        self.models_used = [model]
        self.cached = False
        self.steps: list = []

        # Payload and measurements copied from the completion.
        self.content = comp.content
        self.prompt_tokens, self.completion_tokens = comp.prompt_tokens, comp.completion_tokens
        self.latency_ms = comp.latency_ms

        # The baseline is the call's own cost, so reported savings are zero.
        self.cost = self.baseline_cost_est = comp.cost
        self.savings_pct = 0.0
105
+
106
+
107
def _openai_response(rr, model_label: str) -> dict:
    """Render a routing result as a non-streaming ``chat.completion`` payload.

    Args:
        rr: Object exposing ``content``, the token counts, and the telemetry
            attributes read below (a routing result or a ``_ProxyResult``).
        model_label: Value echoed back in the ``model`` field.

    Returns:
        A JSON-serialisable dict with a freshly generated id, one assistant
        choice whose ``finish_reason`` is always ``"stop"``, token usage, and
        the router telemetry under ``META_KEY``.
    """
    completion_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
    created_at = int(time.time())
    prompt_toks, completion_toks = rr.prompt_tokens, rr.completion_tokens

    assistant_choice = {
        "index": 0,
        "message": {"role": "assistant", "content": rr.content},
        "finish_reason": "stop",
    }
    usage = {
        "prompt_tokens": prompt_toks,
        "completion_tokens": completion_toks,
        "total_tokens": prompt_toks + completion_toks,
    }
    # Monetary values are rounded to 8 decimals, percentages and latency to 1.
    telemetry = {
        "route": rr.route,
        "tier": rr.tier,
        "models_used": rr.models_used,
        "cost_usd": round(rr.cost, 8),
        "baseline_opus_cost_usd": round(rr.baseline_cost_est, 8),
        "savings_pct": round(rr.savings_pct, 1),
        "latency_ms": round(rr.latency_ms, 1),
        "cached": rr.cached,
        "steps": rr.steps,
    }

    return {
        "id": completion_id,
        "object": "chat.completion",
        "created": created_at,
        "model": model_label,
        "choices": [assistant_choice],
        "usage": usage,
        META_KEY: telemetry,
    }
137
+
138
+
139
def _sse_chunks(rr, model_label: str):
    """Yield an already-computed answer as OpenAI-style server-sent events.

    Frame sequence: one role frame, the content split into slices, one frame
    with ``finish_reason="stop"``, one telemetry frame, then ``data: [DONE]``.

    Every JSON frame, including the telemetry one, uses the
    ``chat.completion.chunk`` envelope. The telemetry frame carries an empty
    ``choices`` list plus the router metadata under ``META_KEY``, so a client
    that parses each frame as a chunk always finds the standard fields, while
    consumers that read only ``META_KEY`` see the same data as before.

    Args:
        rr: Object exposing ``content``, ``route``, ``cost``,
            ``baseline_cost_est`` and ``savings_pct``.
        model_label: Value echoed back in each frame's ``model`` field.

    Yields:
        ``str`` frames of the form ``"data: <json>\\n\\n"``.
    """
    cid = f"chatcmpl-{uuid.uuid4().hex[:24]}"
    created = int(time.time())

    def envelope(choices: list, **extra) -> str:
        # One id/created pair is shared by every frame of the stream.
        payload = {
            "id": cid,
            "object": "chat.completion.chunk",
            "created": created,
            "model": model_label,
            "choices": choices,
        }
        payload.update(extra)
        return "data: " + json.dumps(payload) + "\n\n"

    def frame(delta: dict, finish=None) -> str:
        return envelope([{"index": 0, "delta": delta, "finish_reason": finish}])

    yield frame({"role": "assistant"})
    # We compute the full answer first (MoA can't truly stream), then chunk it
    # so streaming clients still work.
    text = rr.content
    step = max(1, len(text) // 40)  # roughly 40 slices, never an empty one
    for i in range(0, len(text), step):
        yield frame({"content": text[i : i + step]})
    yield frame({}, finish="stop")
    yield envelope(
        [],
        **{
            META_KEY: {
                "route": rr.route,
                "cost_usd": round(rr.cost, 8),
                "baseline_opus_cost_usd": round(rr.baseline_cost_est, 8),
                "savings_pct": round(rr.savings_pct, 1),
            }
        },
    )
    yield "data: [DONE]\n\n"
181
+
182
+
183
# Module-level app for `uvicorn switchboard.server:app`. It is built when this
# module is imported and is the same instance that `run()` serves.
app = create_app()
185
+
186
+
187
def run(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Serve the module-level ``app`` with uvicorn on ``host``:``port``.

    uvicorn is imported here rather than at module top, so importing this
    module does not itself import uvicorn.
    """
    from uvicorn import run as serve

    serve(app, host=host, port=port)
@@ -0,0 +1,186 @@
1
+ Metadata-Version: 2.4
2
+ Name: switchboard-llm
3
+ Version: 0.1.0
4
+ Summary: An OpenAI-compatible LLM router that saves cost without losing quality.
5
+ Project-URL: Homepage, https://github.com/archit0/switchboard
6
+ Project-URL: Repository, https://github.com/archit0/switchboard
7
+ Project-URL: Issues, https://github.com/archit0/switchboard/issues
8
+ Author: Archit Dwivedi
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: anthropic,cost,frugalgpt,gateway,gemini,llm,mixture-of-agents,openai,router
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: fastapi>=0.110
21
+ Requires-Dist: httpx>=0.27
22
+ Requires-Dist: uvicorn[standard]>=0.29
23
+ Description-Content-Type: text/markdown
24
+
25
+ # switchboard
26
+
27
+ [![CI](https://github.com/archit0/switchboard/actions/workflows/ci.yml/badge.svg)](https://github.com/archit0/switchboard/actions/workflows/ci.yml)
28
+ [![PyPI](https://img.shields.io/pypi/v/switchboard-llm.svg)](https://pypi.org/project/switchboard-llm/)
29
+
30
+ An **OpenAI-compatible LLM router** that saves cost without losing quality. Point
31
+ any OpenAI client at it and it routes each request to the cheapest model that can
32
+ handle it — easy prompts to a small model, hard ones to a parallel
33
+ **Mixture-of-Agents** — trading a little latency for large savings while holding
34
+ (or beating) frontier-model quality on a representative workload.
35
+
36
+ ```python
37
+ from openai import OpenAI
38
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
39
+ client.chat.completions.create(model="router-cost", messages=[{"role": "user", "content": "..."}])
40
+ ```
41
+
42
+ It works on top of **any OpenAI-compatible gateway that fronts multiple providers**
43
+ behind one key (e.g. a LiteLLM proxy) — so one client can reach OpenAI, Anthropic,
44
+ and Google models just by changing the `model` field. The router is a thin policy
45
+ on top of that.
46
+
47
+ ---
48
+
49
+ ## Install
50
+
51
+ ```bash
52
+ pip install switchboard-llm # or: uv add switchboard-llm
53
+ ```
54
+
55
+ Configure your gateway (any OpenAI-compatible endpoint):
56
+
57
+ ```bash
58
+ export OPENAI_API_KEY=... # your gateway key
59
+ export OPENAI_BASE_URL=https://.../v1 # your endpoint
60
+ ```
61
+
62
+ ## Use it
63
+
64
+ **As a server** (drop-in for any OpenAI client):
65
+
66
+ ```bash
67
+ switchboard serve # http://localhost:8000/v1 (use --port to change)
68
+ ```
69
+ ```python
70
+ from openai import OpenAI
71
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
72
+ r = client.chat.completions.create(model="router", messages=[{"role": "user", "content": "Hi"}])
73
+ print(r.model_extra["switchboard"]) # route, cost, savings telemetry
74
+ ```
75
+
76
+ **As a library:**
77
+
78
+ ```python
79
+ import asyncio
80
+ from switchboard import Engine
81
+
82
+ async def main():
83
+ eng = Engine()
84
+ rr = await eng.answer([{"role": "user", "content": "What is 17 * 23?"}], mode="cost")
85
+ print(rr.content, f"${rr.cost:.6f}", f"{rr.savings_pct:.0f}% cheaper than Opus")
86
+ await eng.aclose()
87
+
88
+ asyncio.run(main())
89
+ ```
90
+
91
+ **From the CLI:**
92
+
93
+ ```bash
94
+ switchboard ask "Prove sqrt(2) is irrational" --mode quality
95
+ switchboard models # probe which gateway models are actually live
96
+ ```
97
+
98
+ ---
99
+
100
+ ## The honest thesis (read this first)
101
+
102
+ The goal is a router that is **cheaper than a frontier model (e.g. Opus) and
103
+ matches-or-beats it on benchmarks**. That is achievable — but only as a
104
+ **portfolio result over a realistic workload**, not a per-query miracle. The iron
105
+ law:
106
+
107
+ > On a *single hard query*, you cannot both beat the frontier model **and** be
108
+ > cheaper than it on that same query.
109
+
110
+ What you *can* do, and what switchboard does:
111
+
112
+ | Traffic | What the router does | Outcome |
113
+ |---|---|---|
114
+ | **Easy queries** (most real traffic) | route to a cheap model | quality ties Opus, **5–50× cheaper** |
115
+ | **Hard queries** (the minority) | **Mixture-of-Agents**: several cheap/mid models answer in parallel, a synthesizer fuses them | quality can **match or exceed** a single Opus call, still **< Opus cost** |
116
+ | **Repeats** | exact-match cache | **free** |
117
+
118
+ Averaged over the workload, total spend is well below always-Opus and mean
119
+ accuracy is **equal-or-better**. Grounded in **RouteLLM**, **FrugalGPT** (cascade
120
+ with a judge), and **Mixture-of-Agents**.
121
+
122
+ ---
123
+
124
+ ## Modes
125
+
126
+ Pick the strategy via the `model` field:
127
+
128
+ | `model` | strategy |
129
+ |---|---|
130
+ | `router` / `router-balanced` | triage → single cheap (easy) / single mid (moderate) / Mixture-of-Agents (hard) |
131
+ | `router-cost` | **FrugalGPT cascade** — answer cheap, a judge scores it, escalate only if low |
132
+ | `router-quality` | bias one tier up — best quality while staying under Opus cost |
133
+
134
+ Any **real** model id (`claude-opus-4-8`, `gpt-5.5`, …) passes straight through, so
135
+ this also works as a plain multi-provider proxy.
136
+
137
+ ## How it works
138
+
139
+ ```
140
+ request ─► [cache] ─► [triage: how hard?] ─► [policy] ──► single cheap model (easy)
141
+ └─► single mid model (moderate)
142
+ └─► Mixture-of-Agents (hard)
143
+ proposers ∥ ─► synthesizer
144
+ ```
145
+
146
+ - **Triage** (`src/switchboard/classify.py`) — free heuristics (length, code/math
147
+ markers, multi-step verbs) decide obvious cases; a tiny LLM classifier scores the
148
+ ambiguous middle. Output: difficulty 1–5 → tier.
149
+ - **Policy / execution** (`src/switchboard/engine.py`) — `single`, `moa` (parallel
150
+ proposers + synthesizer), or `cascade` (cheap → judge → escalate).
151
+ - **Cost accounting** — every response carries its internal cost, an estimate of
152
+ what always-Opus would have cost, and the savings %, under a `switchboard` key.
153
+
154
+ ---
155
+
156
+ ## Results
157
+
158
+ On **GSM8K (50 items, exact numeric grading)**, baseline = always `claude-opus-4-8`:
159
+
160
+ | config | accuracy | total cost | vs Opus |
161
+ |---|---|---|---|
162
+ | always-Opus | 100.0% | $0.3674 | baseline |
163
+ | `router-cost` | **100.0%** | $0.0064 | **57× cheaper — Pareto win** |
164
+ | `router-quality` | 100.0% | $0.2781 | 1.3× cheaper |
165
+ | `router-balanced` | 92.0% | $0.0611 | 6× cheaper but lost accuracy |
166
+
167
+ Reproduce: `python -m bench.run_gsm8k --n 50 --seed 0`. Full write-up and honest
168
+ caveats in [`RESULTS.md`](RESULTS.md). (The verifier is what makes routing safe —
169
+ `router-balanced` has none and lost 8 points; `router-cost`'s judge is the fix.)
170
+
171
+ ---
172
+
173
+ ## Limitations & next steps
174
+
175
+ - **Pricing is a list-price proxy** (`src/switchboard/config.py`). Drop your real
176
+ rate card into `pricing.json` (`{"model": [in_per_1M, out_per_1M]}`) to override.
177
+ - **Triage under-detects "deceptively simple" trap questions** — `router-cost`/
178
+ `router-quality` compensate via the judge/MoA.
179
+ - **Streaming is simulated** (full answer computed, then chunked) — MoA can't
180
+ token-stream; only the single-model path could truly stream.
181
+ - **Semantic cache** (embed prompt → nearest neighbour) is not yet wired.
182
+ - **The gateway's `/v1/models` list may be stale** — trust `switchboard models`.
183
+
184
+ ## License
185
+
186
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,14 @@
1
+ switchboard/__init__.py,sha256=fMa5XVm2tqaCzHbI2_R4Xyuj_OFOoD-qi-BgvVcTvk8,1761
2
+ switchboard/cache.py,sha256=5AOK4Vv_qsAwCJ8DSzsrO3kNW-W55DYnH8Q2NyEsVq8,1976
3
+ switchboard/classify.py,sha256=5iMqEFEXaOmmqtZXwh8OZcPkqFzz6sOXSGx58SGY3ac,4883
4
+ switchboard/cli.py,sha256=p7owaGcmueY9yJdnheeiFh93yxVK8gjy8g-zBGt9UFs,3526
5
+ switchboard/config.py,sha256=kqZ-odX7lMWldcp1nc9NAmulREfmqBInMrUB0COtPrY,4509
6
+ switchboard/engine.py,sha256=tmyyrhyMSOQKVcyVqk1XusI_suDEXutS5OTnRQpTo58,11208
7
+ switchboard/gateway.py,sha256=7EmH5drldQ0DSG_SInZdULUg1GF1NeBtuNPHL3WycHg,4442
8
+ switchboard/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ switchboard/server.py,sha256=C9PPSwomrphRpL5AXxgQNtr_CWbZ2I_Fhfq3YSk4Gwk,6210
10
+ switchboard_llm-0.1.0.dist-info/METADATA,sha256=nm2oqTFM27pFwf0MckcbuzDVvtxuHs4zUFkqrZ8qxpI,7307
11
+ switchboard_llm-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
12
+ switchboard_llm-0.1.0.dist-info/entry_points.txt,sha256=r2X0ZfWK4hPbr1jjFGW-ODJN8-unQVhix3OTC0O3XUA,53
13
+ switchboard_llm-0.1.0.dist-info/licenses/LICENSE,sha256=tys42fCgdQASPWxptC0DasjfZgUdVxXko8tGZ6K4ZOE,1071
14
+ switchboard_llm-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ switchboard = switchboard.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Archit Dwivedi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.