switchboard-llm 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- switchboard/__init__.py +57 -0
- switchboard/cache.py +56 -0
- switchboard/classify.py +142 -0
- switchboard/cli.py +106 -0
- switchboard/config.py +112 -0
- switchboard/engine.py +283 -0
- switchboard/gateway.py +124 -0
- switchboard/py.typed +0 -0
- switchboard/server.py +190 -0
- switchboard_llm-0.1.0.dist-info/METADATA +186 -0
- switchboard_llm-0.1.0.dist-info/RECORD +14 -0
- switchboard_llm-0.1.0.dist-info/WHEEL +4 -0
- switchboard_llm-0.1.0.dist-info/entry_points.txt +2 -0
- switchboard_llm-0.1.0.dist-info/licenses/LICENSE +21 -0
switchboard/__init__.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""switchboard — an OpenAI-compatible LLM router that saves cost without losing quality.
|
|
2
|
+
|
|
3
|
+
Point any OpenAI client at the switchboard server and it routes each request to
|
|
4
|
+
the cheapest model that can handle it — easy prompts to a small model, hard ones
|
|
5
|
+
to a parallel Mixture-of-Agents — trading a little latency for large savings while
|
|
6
|
+
holding (or beating) frontier-model quality on a representative workload.
|
|
7
|
+
|
|
8
|
+
Quickstart (library)::
|
|
9
|
+
|
|
10
|
+
import asyncio
|
|
11
|
+
from switchboard import Engine
|
|
12
|
+
|
|
13
|
+
async def main():
|
|
14
|
+
eng = Engine()
|
|
15
|
+
result = await eng.answer(
|
|
16
|
+
[{"role": "user", "content": "What is 17 * 23?"}],
|
|
17
|
+
mode="cost",
|
|
18
|
+
)
|
|
19
|
+
print(result.content, result.cost, result.savings_pct)
|
|
20
|
+
await eng.aclose()
|
|
21
|
+
|
|
22
|
+
asyncio.run(main())
|
|
23
|
+
|
|
24
|
+
Quickstart (OpenAI-compatible server)::
|
|
25
|
+
|
|
26
|
+
$ switchboard serve # http://localhost:8000/v1
|
|
27
|
+
|
|
28
|
+
from openai import OpenAI
|
|
29
|
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
|
|
30
|
+
client.chat.completions.create(model="router-cost", messages=[...])
|
|
31
|
+
|
|
32
|
+
The gateway is configured via the ``OPENAI_BASE_URL`` and ``OPENAI_API_KEY``
|
|
33
|
+
environment variables (any OpenAI-compatible endpoint that fronts multiple
|
|
34
|
+
providers — e.g. a LiteLLM proxy — works).
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
from switchboard.cache import ResponseCache
|
|
38
|
+
from switchboard.classify import Triage, triage
|
|
39
|
+
from switchboard.config import GatewayConfig, cost_usd, price_of
|
|
40
|
+
from switchboard.engine import Engine, RouteResult
|
|
41
|
+
from switchboard.gateway import Completion, Gateway
|
|
42
|
+
|
|
43
|
+
__version__ = "0.1.0"
|
|
44
|
+
|
|
45
|
+
__all__ = [
|
|
46
|
+
"Completion",
|
|
47
|
+
"Engine",
|
|
48
|
+
"Gateway",
|
|
49
|
+
"GatewayConfig",
|
|
50
|
+
"ResponseCache",
|
|
51
|
+
"RouteResult",
|
|
52
|
+
"Triage",
|
|
53
|
+
"__version__",
|
|
54
|
+
"cost_usd",
|
|
55
|
+
"price_of",
|
|
56
|
+
"triage",
|
|
57
|
+
]
|
switchboard/cache.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Exact-match response cache.
|
|
2
|
+
|
|
3
|
+
The cheapest API call is the one you never make. Repeated/identical prompts
|
|
4
|
+
(very common in agent loops and eval harnesses) return instantly at zero cost.
|
|
5
|
+
A semantic cache (embed the prompt, nearest-neighbour over past prompts) is the
|
|
6
|
+
natural next step — the gateway exposes `gemini-embedding-*` and
|
|
7
|
+
`text-embedding-3-*` for exactly this — but exact-match already captures the
|
|
8
|
+
biggest, safest wins without false hits.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import hashlib
|
|
14
|
+
import json
|
|
15
|
+
import threading
|
|
16
|
+
import time
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _key(messages: list[dict], mode: str) -> str:
    """Return a stable SHA-256 hex digest identifying ``(messages, mode)``.

    The payload is serialised as JSON with sorted keys, so two message dicts
    that differ only in key order map to the same digest.
    """
    blob = json.dumps({"m": messages, "mode": mode}, sort_keys=True, ensure_ascii=False)
    # "surrogatepass" keeps the digest identical for every well-formed string
    # while letting lone surrogates (which JSON request bodies can legally
    # carry as \udXXX escapes) hash instead of raising UnicodeEncodeError.
    return hashlib.sha256(blob.encode("utf-8", "surrogatepass")).hexdigest()


class ResponseCache:
    """Thread-safe, bounded, exact-match cache keyed on ``(messages, mode)``.

    Args:
        max_items: Maximum number of entries kept. When full, the entry that
            was written longest ago is evicted. A value <= 0 disables storage.
        ttl_seconds: Optional lifetime of an entry; ``None`` means entries
            never expire.

    Attributes:
        hits: Number of ``get`` calls that returned a stored value.
        misses: Number of ``get`` calls that found nothing (or only an
            expired entry).
    """

    def __init__(self, max_items: int = 4096, ttl_seconds: float | None = None):
        # Insertion-ordered: put() always re-inserts at the end, so the first
        # key is the least recently written one.
        self._store: dict[str, tuple[float, Any]] = {}
        self._lock = threading.Lock()
        self._max = max_items
        self._ttl = ttl_seconds
        self.hits = 0
        self.misses = 0

    def get(self, messages: list[dict], mode: str) -> Any | None:
        """Return the cached value for ``(messages, mode)`` or ``None``.

        An entry older than ``ttl_seconds`` is dropped and counted as a miss.
        """
        k = _key(messages, mode)
        with self._lock:
            item = self._store.get(k)
            if item is not None:
                ts, val = item
                # Monotonic clock: entry age is immune to wall-clock jumps.
                if self._ttl is None or (time.monotonic() - ts) <= self._ttl:
                    self.hits += 1
                    return val
                del self._store[k]
            self.misses += 1
            return None

    def put(self, messages: list[dict], mode: str, value: Any) -> None:
        """Store ``value`` under ``(messages, mode)``, evicting the oldest entry if full."""
        k = _key(messages, mode)
        if self._max <= 0:
            # Nothing can be stored; previously this path crashed in min()
            # on an empty store.
            return
        with self._lock:
            # Remove first so a refreshed key moves to the "newest" end.
            self._store.pop(k, None)
            while len(self._store) >= self._max:
                # Oldest write is always the first key -> O(1) eviction.
                del self._store[next(iter(self._store))]
            self._store[k] = (time.monotonic(), value)
|
switchboard/classify.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Triage: decide how hard a request is, as cheaply as possible.
|
|
2
|
+
|
|
3
|
+
Two signals, combined:
|
|
4
|
+
1. Heuristics (free, instant): length, code/math markers, multi-step verbs.
|
|
5
|
+
2. A tiny LLM classifier (gemini flash-lite, ~$0.0001/call) that reads a
|
|
6
|
+
truncated prefix of the prompt and returns difficulty 1-5 + domain.
|
|
7
|
+
|
|
8
|
+
The LLM call is skipped when heuristics are confident (very short trivial
|
|
9
|
+
prompts, or obvious giant code/math prompts), so most requests pay nothing for
|
|
10
|
+
triage. The classifier only earns its keep on the ambiguous middle.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import re
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
|
|
19
|
+
from . import config
|
|
20
|
+
from .gateway import Gateway
|
|
21
|
+
|
|
22
|
+
# Substrings treated as evidence that the prompt contains source code.
_CODE_RE = re.compile(
    r"```|def |class |import |function |SELECT |#include|=>|console\.|public static"
)

# Math vocabulary, or an inline arithmetic expression such as "17 * 23".
_MATH_RE = re.compile(
    r"\b(prove|integral|derivative|theorem|equation|matrix|probability|\d+\s*[+\-*/^]\s*\d+)\b",
    re.IGNORECASE,
)

# Verbs and phrases that signal multi-step work; every occurrence is counted.
_HARD_VERBS = re.compile(
    r"\b(design|architect|optimi[sz]e|prove|derive|refactor|debug|analy[sz]e|"
    r"compare|trade-?off|explain why|step by step|plan|strategy|edge cases?)\b",
    re.IGNORECASE,
)

# Domain labels accepted from the LLM classifier.
DOMAINS = ("code", "math", "reasoning", "factual", "creative", "chat", "other")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class Triage:
    """Outcome of triaging one request: how hard it looks and where to route it."""

    # Difficulty estimate on a 1.0 (trivial) .. 5.0 (very hard) scale.
    difficulty: float
    # Coarse subject label, e.g. "code", "math", "reasoning" or "chat".
    domain: str
    # Routing tier derived from the difficulty: "cheap" | "mid" | "hard".
    tier: str
    # Which signal produced the estimate: "heuristic" | "llm".
    source: str
    # Cost and latency of the classifier call; both stay 0.0 when it was skipped.
    classifier_cost: float = 0.0
    classifier_ms: float = 0.0
    # Free-form remark about how the decision was reached.
    note: str = ""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _last_user_text(messages: list[dict]) -> str:
|
|
45
|
+
for m in reversed(messages):
|
|
46
|
+
if m.get("role") == "user":
|
|
47
|
+
c = m.get("content")
|
|
48
|
+
if isinstance(c, str):
|
|
49
|
+
return c
|
|
50
|
+
if isinstance(c, list): # OpenAI content-parts form
|
|
51
|
+
return " ".join(p.get("text", "") for p in c if isinstance(p, dict))
|
|
52
|
+
return ""
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _tier_for(difficulty: float) -> str:
|
|
56
|
+
if difficulty <= 2.0:
|
|
57
|
+
return "cheap"
|
|
58
|
+
if difficulty <= 3.4:
|
|
59
|
+
return "mid"
|
|
60
|
+
return "hard"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _heuristic(text: str) -> tuple[float, str, bool]:
    """Estimate difficulty from surface features of the prompt.

    Returns:
        ``(difficulty_prior, domain_guess, confident)``. When ``confident`` is
        true the caller may skip the LLM classifier entirely.
    """
    length = len(text)
    has_code = _CODE_RE.search(text) is not None
    has_math = _MATH_RE.search(text) is not None
    hard_hits = len(_HARD_VERBS.findall(text))

    # Domain guess, by precedence: code > math > reasoning > chat.
    if has_code:
        domain = "code"
    elif has_math:
        domain = "math"
    elif hard_hits:
        domain = "reasoning"
    else:
        domain = "chat"

    has_marker = has_code or has_math

    # Short prompt without any complexity marker: confidently trivial.
    if length < 80 and not has_marker and not hard_hits:
        return 1.5, domain, True

    # Very long prompt, or three or more hard verbs, combined with a
    # code/math marker or at least two hard verbs: confidently hard.
    if (length > 2500 or hard_hits >= 3) and (has_marker or hard_hits >= 2):
        return 4.5, domain, True

    # Ambiguous middle: build a prior and let the LLM classifier refine it.
    prior = (
        2.0
        + min(length / 1500.0, 1.5)
        + 0.6 * min(hard_hits, 3)
        + (0.5 if has_code else 0.0)
        + (0.5 if has_math else 0.0)
    )
    return max(1.0, min(prior, 5.0)), domain, False
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
# System prompt for the difficulty classifier; it must answer with bare JSON.
_CLASSIFIER_SYS = (
    "You are a fast request-difficulty classifier for an LLM router. "
    "Read the user request and rate how powerful a model it needs. "
    "Respond with ONLY a compact JSON object, no prose:\n"
    '{"difficulty": <1-5 int>, "domain": "code|math|reasoning|factual|creative|chat|other"}\n'
    "Scale: 1=trivial (greeting, lookup), 2=easy (short factual, simple rewrite), "
    "3=moderate (normal coding/explanation), 4=hard (multi-step reasoning, non-trivial "
    "code, careful analysis), 5=very hard (research-grade proof, complex system design)."
)


async def triage(gw: Gateway, messages: list[dict], *, use_llm: bool = True) -> Triage:
    """Estimate how hard a request is and pick a routing tier.

    Heuristics run first. The LLM classifier is consulted only when they are
    not confident and ``use_llm`` is true; its rating is then averaged with
    the heuristic prior.

    Args:
        gw: Gateway used for the (optional) classifier call.
        messages: Chat messages in OpenAI format; only the last user message
            is inspected.
        use_llm: Set to ``False`` to rely on heuristics alone.

    Returns:
        A ``Triage``. On the classifier path ``source`` is ``"llm"`` and the
        classifier's cost/latency are recorded even when its reply could not
        be used; in that case the heuristic prior stands and ``note`` says why
        (``"llm-failed"`` or ``"llm-unparseable"``).
    """
    text = _last_user_text(messages)
    prior, domain, confident = _heuristic(text)

    if confident or not use_llm:
        return Triage(
            prior, domain, _tier_for(prior), "heuristic", note="heuristic-confident" if confident else "llm-disabled"
        )

    # Only a prefix is sent; the full length is passed along as a hint.
    prefix = text[:1800]
    comp = await gw.complete(
        config.CLASSIFIER_MODEL,
        [
            {"role": "system", "content": _CLASSIFIER_SYS},
            {"role": "user", "content": f"Request prefix (len={len(text)} chars):\n{prefix}"},
        ],
        max_tokens=40,
    )

    difficulty, dom, note = prior, domain, ""
    if not comp.ok:
        note = "llm-failed"
    else:
        obj = None
        d = prior
        try:
            m = re.search(r"\{.*\}", comp.content, re.S)
            if m:
                obj = json.loads(m.group(0))
                d = float(obj.get("difficulty", prior))
        except (ValueError, TypeError, AttributeError, OverflowError):
            obj = None
        if obj is None or d != d:  # no JSON, malformed JSON, or NaN rating
            note = "llm-unparseable"
        else:
            # Keep the model's rating on the documented 1-5 scale so an
            # out-of-range reply cannot push the blend outside it.
            d = max(1.0, min(d, 5.0))
            difficulty = 0.5 * d + 0.5 * prior  # blend model judgement with prior
            if obj.get("domain") in DOMAINS:
                dom = obj["domain"]

    return Triage(
        difficulty=difficulty,
        domain=dom,
        tier=_tier_for(difficulty),
        source="llm",
        classifier_cost=comp.cost,
        classifier_ms=comp.latency_ms,
        note=note,
    )
|
switchboard/cli.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""Command-line interface for switchboard.
|
|
2
|
+
|
|
3
|
+
switchboard serve [--host H] [--port P] run the OpenAI-compatible server
|
|
4
|
+
switchboard ask "<prompt>" [--mode cost] route one prompt and print telemetry
|
|
5
|
+
switchboard models probe which gateway models are live
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import asyncio
|
|
12
|
+
import sys
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _cmd_serve(args: argparse.Namespace) -> int:
    """Handle ``switchboard serve``: run the server on ``args.host``/``args.port``.

    Returns 0 once the server's ``run`` call returns.
    """
    # Imported inside the handler, so the server module is loaded only when
    # this subcommand actually runs.
    from switchboard import server

    server.run(host=args.host, port=args.port)
    return 0
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _cmd_ask(args: argparse.Namespace) -> int:
    """Handle ``switchboard ask``: route one prompt and print answer + telemetry.

    The answer goes to stdout; the one-line telemetry summary goes to stderr
    so that stdout stays clean for piping.
    """
    from switchboard.engine import Engine

    async def _route_once() -> int:
        engine = Engine()
        conversation = [{"role": "user", "content": args.prompt}]
        try:
            result = await engine.answer(conversation, mode=args.mode, max_tokens=args.max_tokens)
        finally:
            # Always release the gateway client, even if routing failed.
            await engine.aclose()

        print(result.content)
        telemetry = [
            f"route={result.route}",
            f"cost=${result.cost:.6f}",
            f"baseline(opus)≈${result.baseline_cost_est:.6f}",
            f"savings={result.savings_pct:.1f}%",
            f"{result.latency_ms:.0f}ms",
        ]
        print("\n[" + " | ".join(telemetry) + "]", file=sys.stderr)
        return 0

    return asyncio.run(_route_once())
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _cmd_models(args: argparse.Namespace) -> int:
    """Handle ``switchboard models``: probe every configured model once.

    Sends a tiny prompt to each distinct model named in the config, prints one
    OK/FAIL line per model plus a summary, and returns 1 if any model failed
    (0 otherwise).
    """
    from switchboard import config
    from switchboard.gateway import Gateway

    async def go() -> int:
        # Every model id the router can touch, de-duplicated and sorted.
        pool = sorted(
            set(
                config.CHEAP
                + config.MID
                + config.STRONG
                + [config.CLASSIFIER_MODEL, config.JUDGE_MODEL, config.DEFAULT_CHEAP, config.DEFAULT_MID]
                + config.MOA_PROPOSERS
                + [config.MOA_SYNTHESIZER]
                + config.BASELINE_MODELS
            )
        )
        gw = Gateway()

        async def probe(m: str):
            c = await gw.complete(m, [{"role": "user", "content": "Reply with one word: ok"}], max_tokens=300)
            return m, c.ok, "" if c.ok else (c.error or "")[:90]

        try:
            results = await asyncio.gather(*(probe(m) for m in pool))
        finally:
            # Close the client even when a probe raises, matching _cmd_ask.
            await gw.aclose()

        bad = [r for r in results if not r[1]]
        for m, ok, err in results:
            print(f" {'OK ' if ok else 'FAIL'} {m:<26}{'' if ok else ' <- ' + err}")
        print(
            f"\n{len(results) - len(bad)}/{len(results)} live."
            + (f" BROKEN: {[b[0] for b in bad]}" if bad else " all good.")
        )
        return 1 if bad else 0

    return asyncio.run(go())
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and dispatch to the chosen subcommand.

    Args:
        argv: Argument list without the program name; ``None`` lets argparse
            read ``sys.argv``.

    Returns:
        The exit status produced by the subcommand handler.
    """
    parser = argparse.ArgumentParser(prog="switchboard", description="OpenAI-compatible LLM router.")
    commands = parser.add_subparsers(dest="command", required=True)

    serve = commands.add_parser("serve", help="run the OpenAI-compatible server")
    serve.add_argument("--host", default="0.0.0.0")
    serve.add_argument("--port", type=int, default=8000)
    serve.set_defaults(func=_cmd_serve)

    ask = commands.add_parser("ask", help="route one prompt and print the answer + telemetry")
    ask.add_argument("prompt")
    ask.add_argument("--mode", default="balanced", choices=["balanced", "cost", "quality"])
    ask.add_argument("--max-tokens", type=int, default=1024, dest="max_tokens")
    ask.set_defaults(func=_cmd_ask)

    models = commands.add_parser("models", help="probe which gateway models are actually live")
    models.set_defaults(func=_cmd_models)

    # Each subparser stored its handler under ``func``; call the selected one.
    namespace = parser.parse_args(argv)
    handler = namespace.func
    return handler(namespace)


if __name__ == "__main__":
    sys.exit(main())
|
switchboard/config.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Model pool, pricing and routing tiers.
|
|
2
|
+
|
|
3
|
+
IMPORTANT: the prices below are *public list-price proxies* in USD per 1M
|
|
4
|
+
tokens. They are roughly correct relative to each other (which is what makes
|
|
5
|
+
the routing decisions sensible), but they are almost certainly NOT what your
|
|
6
|
+
gateway actually bills you. Drop your real rate card into `pricing.json`
|
|
7
|
+
in the directory that contains the `switchboard` package (the repo root in a source checkout) and it will override these at load time.
|
|
8
|
+
|
|
9
|
+
The whole point of the router is the *ratios* between tiers: a cheap model is
|
|
10
|
+
~10-50x cheaper than Opus, a mid model ~3-10x cheaper. As long as those ratios
|
|
11
|
+
hold, the cost story holds.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
|
|
20
|
+
# --------------------------------------------------------------------------- #
|
|
21
|
+
# Pricing: USD per 1,000,000 tokens (input, output).
|
|
22
|
+
# --------------------------------------------------------------------------- #
|
|
23
|
+
_DEFAULT_PRICING: dict[str, tuple[float, float]] = {
|
|
24
|
+
# cheap tier
|
|
25
|
+
"gemini-3.1-flash-lite": (0.10, 0.40),
|
|
26
|
+
"gpt-5-nano": (0.05, 0.40),
|
|
27
|
+
"claude-haiku-4-5": (1.00, 5.00),
|
|
28
|
+
# mid tier
|
|
29
|
+
"gemini-3.5-flash": (0.30, 2.50),
|
|
30
|
+
"gpt-5.4-mini": (0.25, 2.00),
|
|
31
|
+
"claude-sonnet-4-6": (3.00, 15.00),
|
|
32
|
+
# strong tier (escalation targets + baselines)
|
|
33
|
+
"claude-opus-4-8": (15.00, 75.00),
|
|
34
|
+
"claude-fable-5": (15.00, 75.00), # placeholder — real price unknown
|
|
35
|
+
"gpt-5.5": (1.25, 10.00),
|
|
36
|
+
"gemini-3.1-pro-preview": (1.25, 10.00),
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _load_pricing() -> dict[str, tuple[float, float]]:
|
|
41
|
+
pricing = dict(_DEFAULT_PRICING)
|
|
42
|
+
path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "pricing.json")
|
|
43
|
+
if os.path.exists(path):
|
|
44
|
+
try:
|
|
45
|
+
with open(path) as f:
|
|
46
|
+
override = json.load(f)
|
|
47
|
+
for k, v in override.items():
|
|
48
|
+
pricing[k] = (float(v[0]), float(v[1]))
|
|
49
|
+
except Exception as e: # noqa: BLE001 - best effort, never crash on bad file
|
|
50
|
+
print(f"[config] warning: could not load pricing.json: {e}")
|
|
51
|
+
return pricing
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
PRICING = _load_pricing()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def price_of(model: str) -> tuple[float, float]:
|
|
58
|
+
"""(input, output) USD per 1M tokens. Falls back to a mid-tier guess."""
|
|
59
|
+
return PRICING.get(model, (1.0, 5.0))
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def cost_usd(model: str, prompt_tokens: int, completion_tokens: int) -> float:
|
|
63
|
+
pin, pout = price_of(model)
|
|
64
|
+
return (prompt_tokens / 1e6) * pin + (completion_tokens / 1e6) * pout
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# --------------------------------------------------------------------------- #
|
|
68
|
+
# Tiers and pools. Edit these to change routing behaviour.
|
|
69
|
+
# --------------------------------------------------------------------------- #
|
|
70
|
+
# NOTE: the gateway's /v1/models list is stale — it advertises some ids that
|
|
71
|
+
# 404 at call time (e.g. gemini-3-pro-preview, claude-fable-5 -> "use Opus 4.8").
|
|
72
|
+
# Every model below has been verified to actually answer. Run
|
|
73
|
+
# `switchboard models` to re-check after any edit.
|
|
74
|
+
# Candidate models per tier.
CHEAP = ["gemini-3.1-flash-lite", "gpt-5-nano", "claude-haiku-4-5"]
MID = ["gemini-3.5-flash", "gpt-5.4-mini", "claude-sonnet-4-6"]
STRONG = ["claude-opus-4-8", "gpt-5.5", "gemini-3.1-pro-preview"]

# Model used for triage and for judging answers. Gemini flash-lite proved the
# most reliable for short structured outputs in testing (gpt-5-nano spent its
# whole budget on hidden reasoning and returned empty).
CLASSIFIER_MODEL = "gemini-3.1-flash-lite"
JUDGE_MODEL = "gemini-3.1-flash-lite"

# The single model picked by default for the cheap and mid tiers.
DEFAULT_CHEAP = "gemini-3.1-flash-lite"
DEFAULT_MID = "gemini-3.5-flash"

# Mixture-of-Agents line-up for the hard tier. Mixing *providers* supplies the
# diversity, so no temperature tweaking is needed (gpt-5 reasoning models
# reject non-default temperature anyway).
MOA_PROPOSERS = ["gemini-3.5-flash", "gpt-5.4-mini", "claude-haiku-4-5"]
MOA_SYNTHESIZER = "claude-sonnet-4-6"

# "Always use the big model" baselines the router is measured against.
# claude-fable-5 is advertised by the gateway but 404s ("use Opus 4.8"), so
# Opus 4.8 is the practical frontier baseline.
BASELINE_MODELS = ["claude-opus-4-8"]
PRIMARY_BASELINE = "claude-opus-4-8"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
@dataclass
class GatewayConfig:
    """Connection settings for the upstream OpenAI-compatible gateway."""

    base_url: str
    api_key: str

    @classmethod
    def from_env(cls) -> GatewayConfig:
        """Build the config from ``OPENAI_BASE_URL`` and ``OPENAI_API_KEY``.

        Trailing slashes are stripped from the base URL.

        Raises:
            RuntimeError: If either variable is unset or empty.
        """
        env = os.environ
        base_url = env.get("OPENAI_BASE_URL", "").rstrip("/")
        api_key = env.get("OPENAI_API_KEY", "")
        if base_url and api_key:
            return cls(base_url=base_url, api_key=api_key)
        raise RuntimeError("OPENAI_BASE_URL and OPENAI_API_KEY must be set in the environment.")
|
switchboard/engine.py
ADDED
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
"""The router engine: triage -> policy -> execute -> telemetry.
|
|
2
|
+
|
|
3
|
+
Three strategies are composed by the policy:
|
|
4
|
+
|
|
5
|
+
single one model answers (trivial / moderate traffic — the common case).
|
|
6
|
+
moa Mixture-of-Agents: N diverse models propose in parallel, a
|
|
7
|
+
synthesizer fuses them. This is the lever that can *beat* a single
|
|
8
|
+
frontier model on hard queries, at well below frontier cost.
|
|
9
|
+
cascade FrugalGPT-style: answer cheap, a cheap judge scores it, escalate
|
|
10
|
+
only if the score is low. Minimises spend on the easy majority.
|
|
11
|
+
|
|
12
|
+
Every result reports its internal cost and an estimate of what always-Opus
|
|
13
|
+
would have cost, so savings are measured, not asserted.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import asyncio
|
|
19
|
+
import json
|
|
20
|
+
import re
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
|
|
23
|
+
from . import config
|
|
24
|
+
from .cache import ResponseCache
|
|
25
|
+
from .classify import Triage, triage
|
|
26
|
+
from .gateway import Completion, Gateway
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class RouteResult:
    """Answer to one routed request together with its cost/latency telemetry."""

    # Text of the final answer.
    content: str
    # Human-readable description of the strategy that produced the answer.
    route: str
    # Tier assigned by triage.
    tier: str
    # Ids of the answering models that were called.
    models_used: list[str]
    # Token counts of the final completion (estimated when not reported).
    prompt_tokens: int
    completion_tokens: int
    # Total USD spent on this request.
    cost: float
    # Estimated USD cost of the same token counts on the primary baseline model.
    baseline_cost_est: float
    # Relative saving versus that baseline, in percent.
    savings_pct: float
    # Elapsed time for the request in milliseconds.
    latency_ms: float
    # True when the result was served from the response cache.
    cached: bool
    # The triage decision behind the routing.
    triage: Triage
    # Per-stage telemetry records, in execution order.
    steps: list[dict] = field(default_factory=list)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
_JUDGE_SYS = (
|
|
47
|
+
"You are a strict answer-quality judge. Given a user request and a candidate "
|
|
48
|
+
"answer, rate how fully and correctly the answer satisfies the request. "
|
|
49
|
+
'Respond ONLY with JSON: {"score": <0.0-1.0>, "reason": "<short>"}. '
|
|
50
|
+
"0.0 = wrong/empty/evasive, 1.0 = complete and correct."
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
_SYNTH_SYS = (
|
|
54
|
+
"You are an expert synthesizer in a Mixture-of-Agents system. You are given a "
|
|
55
|
+
"user request and several candidate answers from different models. The "
|
|
56
|
+
"candidates may be uneven or partly wrong. Critically compare them, discard "
|
|
57
|
+
"errors, and produce a single best answer that is more accurate and complete "
|
|
58
|
+
"than any individual candidate. Do not mention the candidates or the process; "
|
|
59
|
+
"just give the final answer to the user."
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _user_request_text(messages: list[dict]) -> str:
|
|
64
|
+
parts = []
|
|
65
|
+
for m in messages:
|
|
66
|
+
role = m.get("role")
|
|
67
|
+
c = m.get("content")
|
|
68
|
+
if isinstance(c, list):
|
|
69
|
+
c = " ".join(p.get("text", "") for p in c if isinstance(p, dict))
|
|
70
|
+
if role in ("user", "system") and c:
|
|
71
|
+
parts.append(f"{role}: {c}")
|
|
72
|
+
return "\n".join(parts)[:6000]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class Engine:
    """Route chat requests through triage -> policy -> execution -> telemetry.

    Args:
        gateway: Client used for every model call; a default ``Gateway()`` is
            created when omitted.
        cache: Response cache; a default ``ResponseCache()`` is created when
            ``None`` is passed.
    """

    def __init__(self, gateway: Gateway | None = None, cache: ResponseCache | None = None):
        self.gw = gateway or Gateway()
        self.cache = cache if cache is not None else ResponseCache()

    async def aclose(self) -> None:
        """Close the underlying gateway client."""
        await self.gw.aclose()

    # ---- primitives ------------------------------------------------------- #
    async def _single(self, model: str, messages: list[dict], max_tokens: int) -> Completion:
        """Ask one model to answer the conversation."""
        return await self.gw.complete(model, messages, max_tokens=max_tokens)

    async def _judge(self, messages: list[dict], answer: str) -> tuple[float, Completion]:
        """Have the judge model score ``answer`` against the request.

        Returns:
            ``(score, completion)`` with the score clamped to [0.0, 1.0]. A
            neutral 0.5 is used when the judge call fails or its reply cannot
            be parsed.
        """
        req = _user_request_text(messages)
        comp = await self.gw.complete(
            config.JUDGE_MODEL,
            [
                {"role": "system", "content": _JUDGE_SYS},
                {"role": "user", "content": f"REQUEST:\n{req}\n\nANSWER:\n{answer[:4000]}"},
            ],
            max_tokens=60,
        )
        score = 0.5
        if comp.ok:
            try:
                m = re.search(r"\{.*\}", comp.content, re.S)
                score = float(json.loads(m.group(0)).get("score", 0.5)) if m else 0.5
            except (ValueError, TypeError, AttributeError, OverflowError):
                score = 0.5
        return max(0.0, min(score, 1.0)), comp

    async def _moa(
        self, messages: list[dict], proposers: list[str], synthesizer: str, max_tokens: int
    ) -> tuple[Completion, list[Completion]]:
        """Run Mixture-of-Agents: parallel proposers, then one synthesizer.

        Returns:
            ``(final, proposals)``. If every proposer fails, ``final`` is a
            single answer from ``config.DEFAULT_MID`` instead of a synthesis.
        """
        # Fan out proposers concurrently — this is the parallelism that keeps MoA
        # from being N times slower; wall-clock ≈ slowest proposer + synthesizer.
        proposals = await asyncio.gather(*(self.gw.complete(m, messages, max_tokens=max_tokens) for m in proposers))
        good = [p for p in proposals if p.ok]
        if not good:
            # everything failed; fall back to a single mid model
            fb = await self._single(config.DEFAULT_MID, messages, max_tokens)
            return fb, list(proposals)

        req = _user_request_text(messages)
        bundle = "\n\n".join(f"--- Candidate {i + 1} (model {p.model}) ---\n{p.content}" for i, p in enumerate(good))
        synth = await self.gw.complete(
            synthesizer,
            [
                {"role": "system", "content": _SYNTH_SYS},
                {"role": "user", "content": f"USER REQUEST:\n{req}\n\nCANDIDATE ANSWERS:\n{bundle}"},
            ],
            max_tokens=max_tokens,
        )
        return synth, list(proposals)

    @staticmethod
    def _moa_models(proposals: list[Completion]) -> list[str]:
        """List the models a default-configured ``_moa`` run actually called.

        Mirrors ``_moa``: when no proposer succeeded, the fallback mid model
        answered and the synthesizer was never invoked.
        """
        finisher = config.MOA_SYNTHESIZER if any(p.ok for p in proposals) else config.DEFAULT_MID
        return [p.model for p in proposals] + [finisher]

    @staticmethod
    def _detached(result: RouteResult, **changes) -> RouteResult:
        """Copy ``result`` with ``changes`` applied and its list fields duplicated.

        Keeps callers and the cache from sharing (and mutating) the same
        ``models_used`` / ``steps`` lists.
        """
        data = {**result.__dict__, **changes}
        data["models_used"] = list(data["models_used"])
        data["steps"] = list(data["steps"])
        return RouteResult(**data)

    # ---- top-level -------------------------------------------------------- #
    async def answer(
        self,
        messages: list[dict],
        *,
        mode: str = "balanced",
        max_tokens: int = 1024,
        use_llm_triage: bool = True,
        use_cache: bool = True,
    ) -> RouteResult:
        """Answer a conversation using the routing policy selected by ``mode``.

        Args:
            messages: Chat messages in OpenAI format.
            mode: ``"cost"`` runs the cascade; ``"quality"`` routes one tier
                above the triage result; any other value (normally
                ``"balanced"``) routes by the triage tier as-is.
            max_tokens: Completion budget passed to every answering model.
            use_llm_triage: Allow the LLM classifier during triage.
            use_cache: Read from and write to the response cache.

        Returns:
            A ``RouteResult``. Cache hits are returned with ``cached=True``
            and ``latency_ms=0.0``.
        """
        # We are inside a coroutine, so the running loop is the right clock
        # source; get_event_loop() is the deprecated way to reach it.
        loop = asyncio.get_running_loop()
        t_start = loop.time()

        # max_tokens is part of the cache identity: an answer produced under
        # a small budget must not be served to a request with a larger one.
        cache_mode = f"{mode}|max_tokens={max_tokens}"

        if use_cache:
            cached = self.cache.get(messages, cache_mode)
            if cached is not None:
                return self._detached(cached, cached=True, latency_ms=0.0)

        tri = await triage(self.gw, messages, use_llm=use_llm_triage)
        steps: list[dict] = [{"stage": "triage", **_triage_dict(tri)}]

        spent = tri.classifier_cost
        models_used: list[str] = []

        # ------------------------------------------------------------------ #
        # Policy
        # ------------------------------------------------------------------ #
        if mode == "cost":
            final, extra_cost, used, route = await self._cascade(messages, tri, max_tokens, steps)
            spent += extra_cost
            models_used += used
        else:
            tier = tri.tier
            if mode == "quality":  # bias one tier up
                tier = {"cheap": "mid", "mid": "hard", "hard": "hard"}[tier]

            if tier == "cheap":
                comp = await self._single(config.DEFAULT_CHEAP, messages, max_tokens)
                final, route, used = comp, f"single:{config.DEFAULT_CHEAP}", [config.DEFAULT_CHEAP]
                spent += comp.cost
                steps.append(_step("single", comp))
            elif tier == "mid":
                comp = await self._single(config.DEFAULT_MID, messages, max_tokens)
                final, route, used = comp, f"single:{config.DEFAULT_MID}", [config.DEFAULT_MID]
                spent += comp.cost
                steps.append(_step("single", comp))
            else:  # hard -> Mixture-of-Agents
                synth, props = await self._moa(messages, config.MOA_PROPOSERS, config.MOA_SYNTHESIZER, max_tokens)
                used = self._moa_models(props)
                route = f"moa[{'+'.join(config.MOA_PROPOSERS)}]->{config.MOA_SYNTHESIZER}"
                spent += sum(p.cost for p in props) + synth.cost
                for p in props:
                    steps.append(_step("moa-proposer", p))
                steps.append(_step("moa-synth", synth))
                final = synth
            models_used += used

        # ------------------------------------------------------------------ #
        # Telemetry + baseline comparison
        # ------------------------------------------------------------------ #
        # Fall back to estimates when no token usage was reported.
        rep_pt = final.prompt_tokens or _est_tokens(messages)
        rep_ct = final.completion_tokens or _est_tokens_text(final.content)
        baseline = config.cost_usd(config.PRIMARY_BASELINE, rep_pt, rep_ct)
        savings = (1 - spent / baseline) * 100 if baseline > 0 else 0.0

        result = RouteResult(
            content=final.content,
            route=route,
            tier=tri.tier,
            models_used=models_used,
            prompt_tokens=rep_pt,
            completion_tokens=rep_ct,
            cost=spent,
            baseline_cost_est=baseline,
            savings_pct=savings,
            latency_ms=(loop.time() - t_start) * 1000,
            cached=False,
            triage=tri,
            steps=steps,
        )
        if use_cache and final.ok:
            # Cache a detached copy so later mutation of the returned result
            # cannot alter what the cache serves.
            self.cache.put(messages, cache_mode, self._detached(result))
        return result

    async def _cascade(
        self, messages: list[dict], tri: Triage, max_tokens: int, steps: list[dict]
    ) -> tuple[Completion, float, list[str], str]:
        """FrugalGPT cascade: cheap -> judge -> mid -> judge -> MoA.

        Each single-model stage is accepted when its answer is usable and the
        judge scores it at or above that stage's threshold; otherwise the next
        stage runs. Telemetry records are appended to ``steps`` in place.

        Returns:
            ``(final_completion, cost_spent, models_called, route_label)``.
        """
        spent = 0.0
        used: list[str] = []

        stages = (
            ("cascade-cheap", config.DEFAULT_CHEAP, 0.70),
            ("cascade-mid", config.DEFAULT_MID, 0.65),
        )
        for stage, model, threshold in stages:
            comp = await self._single(model, messages, max_tokens)
            spent += comp.cost
            used.append(model)
            steps.append(_step(stage, comp))
            if not comp.ok:
                # A failed or empty answer can never be accepted, so do not
                # pay for a judge call on it — escalate straight away.
                continue
            score, verdict = await self._judge(messages, comp.content)
            spent += verdict.cost
            steps.append({"stage": "cascade-judge", "score": round(score, 3), "cost": verdict.cost})
            if score >= threshold:
                return comp, spent, used, f"cascade:single:{model}(score={score:.2f})"

        synth, props = await self._moa(messages, config.MOA_PROPOSERS, config.MOA_SYNTHESIZER, max_tokens)
        spent += sum(p.cost for p in props) + synth.cost
        used += self._moa_models(props)
        for p in props:
            steps.append(_step("cascade-moa-proposer", p))
        steps.append(_step("cascade-moa-synth", synth))
        return synth, spent, used, f"cascade:moa->{config.MOA_SYNTHESIZER}"
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
# --------------------------------------------------------------------------- #
|
|
252
|
+
# helpers
|
|
253
|
+
# --------------------------------------------------------------------------- #
|
|
254
|
+
def _est_tokens(messages: list[dict]) -> int:
    """Estimate the token count of the user request contained in *messages*.

    The text returned by ``_user_request_text`` is counted at one token per
    four characters; the result is never lower than 1.
    """
    request_text = _user_request_text(messages)
    approx = len(request_text) // 4
    return approx if approx > 1 else 1
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def _est_tokens_text(text: str) -> int:
|
|
260
|
+
return max(1, len(text) // 4)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _triage_dict(t: Triage) -> dict:
|
|
264
|
+
return {
|
|
265
|
+
"difficulty": round(t.difficulty, 2),
|
|
266
|
+
"domain": t.domain,
|
|
267
|
+
"tier": t.tier,
|
|
268
|
+
"source": t.source,
|
|
269
|
+
"cost": round(t.classifier_cost, 8),
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def _step(stage: str, c: Completion) -> dict:
|
|
274
|
+
return {
|
|
275
|
+
"stage": stage,
|
|
276
|
+
"model": c.model,
|
|
277
|
+
"ok": c.ok,
|
|
278
|
+
"prompt_tokens": c.prompt_tokens,
|
|
279
|
+
"completion_tokens": c.completion_tokens,
|
|
280
|
+
"cost": c.cost,
|
|
281
|
+
"latency_ms": round(c.latency_ms, 1),
|
|
282
|
+
"error": c.error,
|
|
283
|
+
}
|
switchboard/gateway.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""Async client for the OpenAI-compatible gateway.
|
|
2
|
+
|
|
3
|
+
One client addresses every model — OpenAI, Anthropic and Google — by just
|
|
4
|
+
swapping the `model` field. This is what makes the router simple: it is just a
|
|
5
|
+
policy on top of a single `complete()` call.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import time
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
|
|
13
|
+
import httpx
|
|
14
|
+
|
|
15
|
+
from . import config
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class Completion:
    """Outcome of a single gateway chat-completion call, successful or not."""

    model: str
    content: str
    prompt_tokens: int
    completion_tokens: int
    cost: float
    latency_ms: float
    finish_reason: str = ""
    # None unless the call failed; then a short description of the failure.
    error: str | None = None
    # Decoded response body; excluded from repr().
    raw: dict = field(default_factory=dict, repr=False)

    @property
    def ok(self) -> bool:
        """True when no error was recorded and the content is not blank."""
        if self.error is not None:
            return False
        return bool(self.content.strip())
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# gpt-5* reasoning models bill hidden reasoning tokens against the completion
|
|
36
|
+
# budget and return empty content if the budget is too small. We give them
|
|
37
|
+
# headroom and retry once with a bigger budget if they come back empty.
|
|
38
|
+
def _is_reasoning_model(model: str) -> bool:
|
|
39
|
+
return model.startswith("gpt-5") or model.startswith("o3") or model.startswith("o4")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Gateway:
    """Async client for one OpenAI-compatible gateway; all models go through ``complete()``."""

    def __init__(self, cfg: config.GatewayConfig | None = None, *, concurrency: int = 24):
        """Build the shared ``httpx.AsyncClient``.

        Args:
            cfg: Gateway settings; ``GatewayConfig.from_env()`` is used when omitted.
            concurrency: Passed to ``httpx.Limits(max_connections=...)``.
        """
        self.cfg = cfg or config.GatewayConfig.from_env()
        self._client = httpx.AsyncClient(
            base_url=self.cfg.base_url,
            headers={
                "Authorization": f"Bearer {self.cfg.api_key}",
                "Content-Type": "application/json",
            },
            timeout=httpx.Timeout(120.0, connect=10.0),
            limits=httpx.Limits(max_connections=concurrency),
        )

    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def complete(
        self,
        model: str,
        messages: list[dict],
        *,
        max_tokens: int = 1024,
        temperature: float | None = None,
        retries: int = 2,
    ) -> Completion:
        """POST one chat completion and return it as a ``Completion``.

        Failures are reported through ``Completion.error`` rather than raised.

        Args:
            model: Model id sent in the payload.
            messages: Chat messages sent in the payload.
            max_tokens: Requested completion budget; reasoning models get extra headroom.
            temperature: Sent only when given and the model is not a reasoning model.
            retries: Extra attempts allowed after the first one.

        Returns:
            The parsed completion, or an empty ``Completion`` with ``error`` set
            when a non-retryable status is returned or every attempt fails.
        """
        reasoning = _is_reasoning_model(model)
        # Reasoning models need extra headroom; everyone gets at least the ask.
        budget = max_tokens
        if reasoning:
            budget = max(max_tokens, 1024) + 1024  # room for hidden reasoning

        last_err = "unknown error"
        for attempt in range(retries + 1):
            payload: dict = {
                "model": model,
                "messages": messages,
                "max_completion_tokens": budget,
            }
            # Reasoning models reject non-default temperature.
            if temperature is not None and not reasoning:
                payload["temperature"] = temperature

            t0 = time.perf_counter()
            try:
                r = await self._client.post("/chat/completions", json=payload)
                dt = (time.perf_counter() - t0) * 1000
                if r.status_code != 200:
                    last_err = f"HTTP {r.status_code}: {r.text[:200]}"
                    # 4xx other than rate limit won't fix on retry
                    if r.status_code not in (408, 409, 425, 429, 500, 502, 503, 504):
                        return Completion(model, "", 0, 0, 0.0, dt, error=last_err)
                    continue
                data = r.json()
            except Exception as e:  # noqa: BLE001
                # Transport errors and undecodable bodies are retried.
                last_err = f"{type(e).__name__}: {e}"
                continue

            # A 200 whose body is not a JSON object cannot be parsed below;
            # treat it like any other failed attempt instead of raising.
            if not isinstance(data, dict):
                last_err = f"unexpected response body of type {type(data).__name__}"
                continue

            choice = (data.get("choices") or [{}])[0]
            content = (choice.get("message") or {}).get("content") or ""
            finish = choice.get("finish_reason") or ""
            usage = data.get("usage") or {}
            # `or 0` also covers gateways that send an explicit null count.
            pt = int(usage.get("prompt_tokens") or 0)
            ct = int(usage.get("completion_tokens") or 0)
            cost = config.cost_usd(model, pt, ct)

            # Empty content because the model ran out of budget — retry bigger.
            # NOTE(review): the cost of the discarded attempt is not added to
            # the Completion that is eventually returned.
            if not content.strip() and finish == "length" and attempt < retries:
                budget *= 2
                last_err = "empty content (finish_reason=length)"
                continue

            return Completion(
                model=model,
                content=content,
                prompt_tokens=pt,
                completion_tokens=ct,
                cost=cost,
                latency_ms=dt,
                finish_reason=finish,
                raw=data,
            )

        return Completion(model, "", 0, 0, 0.0, 0.0, error=last_err)
|
switchboard/py.typed
ADDED
|
File without changes
|
switchboard/server.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""OpenAI-compatible HTTP server in front of the router.
|
|
2
|
+
|
|
3
|
+
Point any OpenAI client at it and it Just Works::
|
|
4
|
+
|
|
5
|
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
|
|
6
|
+
client.chat.completions.create(model="router", messages=[...])
|
|
7
|
+
|
|
8
|
+
Virtual models:
|
|
9
|
+
router / router-balanced triage -> single (easy) or Mixture-of-Agents (hard)
|
|
10
|
+
router-cost FrugalGPT cascade (cheapest; escalate on low score)
|
|
11
|
+
router-quality bias one tier up (best quality under Opus cost)
|
|
12
|
+
|
|
13
|
+
Any *real* model id (e.g. "claude-opus-4-8", "gemini-3-pro-preview") is passed
|
|
14
|
+
straight through to the gateway, so this also works as a plain proxy.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import time
|
|
21
|
+
import uuid
|
|
22
|
+
from contextlib import asynccontextmanager
|
|
23
|
+
|
|
24
|
+
from fastapi import FastAPI, Request
|
|
25
|
+
from fastapi.responses import JSONResponse, StreamingResponse
|
|
26
|
+
|
|
27
|
+
from switchboard.engine import Engine
|
|
28
|
+
|
|
29
|
+
MODE_BY_MODEL = {
|
|
30
|
+
"router": "balanced",
|
|
31
|
+
"router-balanced": "balanced",
|
|
32
|
+
"router-cost": "cost",
|
|
33
|
+
"router-quality": "quality",
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
# Key under which router telemetry is attached to each response (ignored by
|
|
37
|
+
# standard OpenAI clients).
|
|
38
|
+
META_KEY = "switchboard"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@asynccontextmanager
async def _lifespan(app: FastAPI):
    """Attach a fresh ``Engine`` to ``app.state`` on startup and close it on shutdown."""
    engine = Engine()
    app.state.engine = engine
    try:
        yield
    finally:
        # Close whichever engine is on app.state when the app stops.
        await app.state.engine.aclose()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def create_app() -> FastAPI:
    """Build the FastAPI app exposing ``/healthz``, ``/v1/models`` and ``/v1/chat/completions``.

    Returns:
        A new application whose lifespan is managed by ``_lifespan``.
    """
    app = FastAPI(title="switchboard", version="0.1.0", lifespan=_lifespan)

    def _bad_request(message: str) -> JSONResponse:
        """Return an OpenAI-style 400 error for a malformed client request."""
        return JSONResponse(
            {"error": {"message": message, "type": "invalid_request_error"}},
            status_code=400,
        )

    @app.get("/healthz")
    async def healthz() -> dict:
        return {"ok": True}

    @app.get("/v1/models")
    async def models() -> JSONResponse:
        virtual = [{"id": m, "object": "model", "owned_by": "switchboard"} for m in MODE_BY_MODEL]
        return JSONResponse({"object": "list", "data": virtual})

    @app.post("/v1/chat/completions")
    async def chat_completions(request: Request):
        engine: Engine = request.app.state.engine

        # The body is untrusted: reject malformed input with a 400 instead of
        # letting it surface as an unhandled exception (HTTP 500).
        try:
            body = await request.json()
        except ValueError:  # JSONDecodeError and UnicodeDecodeError are both ValueErrors
            return _bad_request("request body is not valid JSON")
        if not isinstance(body, dict):
            return _bad_request("request body must be a JSON object")

        model = body.get("model", "router")
        if not isinstance(model, str):
            return _bad_request("'model' must be a string")
        messages = body.get("messages", [])
        if not isinstance(messages, list):
            return _bad_request("'messages' must be an array")
        stream = bool(body.get("stream", False))
        try:
            max_tokens = int(body.get("max_completion_tokens") or body.get("max_tokens") or 1024)
        except (TypeError, ValueError):
            return _bad_request("'max_completion_tokens' / 'max_tokens' must be an integer")
        if max_tokens <= 0:
            return _bad_request("'max_completion_tokens' / 'max_tokens' must be positive")

        # Real model id -> transparent proxy.
        if model not in MODE_BY_MODEL:
            comp = await engine.gw.complete(model, messages, max_tokens=max_tokens, temperature=body.get("temperature"))
            if not comp.ok and comp.error:
                return JSONResponse({"error": {"message": comp.error}}, status_code=502)
            shim = _ProxyResult(comp, model)
            if stream:
                return StreamingResponse(_sse_chunks(shim, model), media_type="text/event-stream")
            return JSONResponse(_openai_response(shim, model))

        rr = await engine.answer(messages, mode=MODE_BY_MODEL[model], max_tokens=max_tokens)
        if stream:
            return StreamingResponse(_sse_chunks(rr, model), media_type="text/event-stream")
        return JSONResponse(_openai_response(rr, model))

    return app
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class _ProxyResult:
|
|
90
|
+
"""Adapts a raw Completion to the RouteResult-ish shape the formatter expects."""
|
|
91
|
+
|
|
92
|
+
def __init__(self, comp, model: str):
|
|
93
|
+
self.content = comp.content
|
|
94
|
+
self.route = f"proxy:{model}"
|
|
95
|
+
self.tier = "n/a"
|
|
96
|
+
self.models_used = [model]
|
|
97
|
+
self.prompt_tokens = comp.prompt_tokens
|
|
98
|
+
self.completion_tokens = comp.completion_tokens
|
|
99
|
+
self.cost = comp.cost
|
|
100
|
+
self.baseline_cost_est = comp.cost
|
|
101
|
+
self.savings_pct = 0.0
|
|
102
|
+
self.latency_ms = comp.latency_ms
|
|
103
|
+
self.cached = False
|
|
104
|
+
self.steps: list = []
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _openai_response(rr, model_label: str) -> dict:
|
|
108
|
+
return {
|
|
109
|
+
"id": f"chatcmpl-{uuid.uuid4().hex[:24]}",
|
|
110
|
+
"object": "chat.completion",
|
|
111
|
+
"created": int(time.time()),
|
|
112
|
+
"model": model_label,
|
|
113
|
+
"choices": [
|
|
114
|
+
{
|
|
115
|
+
"index": 0,
|
|
116
|
+
"message": {"role": "assistant", "content": rr.content},
|
|
117
|
+
"finish_reason": "stop",
|
|
118
|
+
}
|
|
119
|
+
],
|
|
120
|
+
"usage": {
|
|
121
|
+
"prompt_tokens": rr.prompt_tokens,
|
|
122
|
+
"completion_tokens": rr.completion_tokens,
|
|
123
|
+
"total_tokens": rr.prompt_tokens + rr.completion_tokens,
|
|
124
|
+
},
|
|
125
|
+
META_KEY: {
|
|
126
|
+
"route": rr.route,
|
|
127
|
+
"tier": rr.tier,
|
|
128
|
+
"models_used": rr.models_used,
|
|
129
|
+
"cost_usd": round(rr.cost, 8),
|
|
130
|
+
"baseline_opus_cost_usd": round(rr.baseline_cost_est, 8),
|
|
131
|
+
"savings_pct": round(rr.savings_pct, 1),
|
|
132
|
+
"latency_ms": round(rr.latency_ms, 1),
|
|
133
|
+
"cached": rr.cached,
|
|
134
|
+
"steps": rr.steps,
|
|
135
|
+
},
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _sse_chunks(rr, model_label: str):
|
|
140
|
+
cid = f"chatcmpl-{uuid.uuid4().hex[:24]}"
|
|
141
|
+
created = int(time.time())
|
|
142
|
+
|
|
143
|
+
def frame(delta: dict, finish=None) -> str:
|
|
144
|
+
return (
|
|
145
|
+
"data: "
|
|
146
|
+
+ json.dumps(
|
|
147
|
+
{
|
|
148
|
+
"id": cid,
|
|
149
|
+
"object": "chat.completion.chunk",
|
|
150
|
+
"created": created,
|
|
151
|
+
"model": model_label,
|
|
152
|
+
"choices": [{"index": 0, "delta": delta, "finish_reason": finish}],
|
|
153
|
+
}
|
|
154
|
+
)
|
|
155
|
+
+ "\n\n"
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
yield frame({"role": "assistant"})
|
|
159
|
+
# We compute the full answer first (MoA can't truly stream), then chunk it
|
|
160
|
+
# so streaming clients still work.
|
|
161
|
+
text = rr.content
|
|
162
|
+
step = max(1, len(text) // 40)
|
|
163
|
+
for i in range(0, len(text), step):
|
|
164
|
+
yield frame({"content": text[i : i + step]})
|
|
165
|
+
yield frame({}, finish="stop")
|
|
166
|
+
yield (
|
|
167
|
+
"data: "
|
|
168
|
+
+ json.dumps(
|
|
169
|
+
{
|
|
170
|
+
META_KEY: {
|
|
171
|
+
"route": rr.route,
|
|
172
|
+
"cost_usd": round(rr.cost, 8),
|
|
173
|
+
"baseline_opus_cost_usd": round(rr.baseline_cost_est, 8),
|
|
174
|
+
"savings_pct": round(rr.savings_pct, 1),
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
)
|
|
178
|
+
+ "\n\n"
|
|
179
|
+
)
|
|
180
|
+
yield "data: [DONE]\n\n"
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# Module-level app for `uvicorn switchboard.server:app`.
|
|
184
|
+
app = create_app()
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def run(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Serve the module-level ``app`` with uvicorn on *host* and *port*."""
    # uvicorn is imported here rather than at module top.
    from uvicorn import run as serve

    serve(app, host=host, port=port)
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: switchboard-llm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An OpenAI-compatible LLM router that saves cost without losing quality.
|
|
5
|
+
Project-URL: Homepage, https://github.com/archit0/switchboard
|
|
6
|
+
Project-URL: Repository, https://github.com/archit0/switchboard
|
|
7
|
+
Project-URL: Issues, https://github.com/archit0/switchboard/issues
|
|
8
|
+
Author: Archit Dwivedi
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: anthropic,cost,frugalgpt,gateway,gemini,llm,mixture-of-agents,openai,router
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Requires-Dist: fastapi>=0.110
|
|
21
|
+
Requires-Dist: httpx>=0.27
|
|
22
|
+
Requires-Dist: uvicorn[standard]>=0.29
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# switchboard
|
|
26
|
+
|
|
27
|
+
[CI status](https://github.com/archit0/switchboard/actions/workflows/ci.yml)
|
|
28
|
+
[PyPI package](https://pypi.org/project/switchboard-llm/)
|
|
29
|
+
|
|
30
|
+
An **OpenAI-compatible LLM router** that saves cost without losing quality. Point
|
|
31
|
+
any OpenAI client at it and it routes each request to the cheapest model that can
|
|
32
|
+
handle it — easy prompts to a small model, hard ones to a parallel
|
|
33
|
+
**Mixture-of-Agents** — trading a little latency for large savings while holding
|
|
34
|
+
(or beating) frontier-model quality on a representative workload.
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from openai import OpenAI
|
|
38
|
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
|
|
39
|
+
client.chat.completions.create(model="router-cost", messages=[{"role": "user", "content": "..."}])
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
It works on top of **any OpenAI-compatible gateway that fronts multiple providers**
|
|
43
|
+
behind one key (e.g. a LiteLLM proxy) — so one client can reach OpenAI, Anthropic,
|
|
44
|
+
and Google models just by changing the `model` field. The router is a thin policy
|
|
45
|
+
on top of that.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Install
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install switchboard-llm # or: uv add switchboard-llm
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Configure your gateway (any OpenAI-compatible endpoint):
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
export OPENAI_API_KEY=... # your gateway key
|
|
59
|
+
export OPENAI_BASE_URL=https://.../v1 # your endpoint
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Use it
|
|
63
|
+
|
|
64
|
+
**As a server** (drop-in for any OpenAI client):
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
switchboard serve # http://localhost:8000/v1 (use --port to change)
|
|
68
|
+
```
|
|
69
|
+
```python
|
|
70
|
+
from openai import OpenAI
|
|
71
|
+
client = OpenAI(base_url="http://localhost:8000/v1", api_key="anything")
|
|
72
|
+
r = client.chat.completions.create(model="router", messages=[{"role": "user", "content": "Hi"}])
|
|
73
|
+
print(r.model_extra["switchboard"]) # route, cost, savings telemetry
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
**As a library:**
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
import asyncio
|
|
80
|
+
from switchboard import Engine
|
|
81
|
+
|
|
82
|
+
async def main():
|
|
83
|
+
eng = Engine()
|
|
84
|
+
rr = await eng.answer([{"role": "user", "content": "What is 17 * 23?"}], mode="cost")
|
|
85
|
+
print(rr.content, f"${rr.cost:.6f}", f"{rr.savings_pct:.0f}% cheaper than Opus")
|
|
86
|
+
await eng.aclose()
|
|
87
|
+
|
|
88
|
+
asyncio.run(main())
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
**From the CLI:**
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
switchboard ask "Prove sqrt(2) is irrational" --mode quality
|
|
95
|
+
switchboard models # probe which gateway models are actually live
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## The honest thesis (read this first)
|
|
101
|
+
|
|
102
|
+
The goal is a router that is **cheaper than a frontier model (e.g. Opus) and
|
|
103
|
+
matches-or-beats it on benchmarks**. That is achievable — but only as a
|
|
104
|
+
**portfolio result over a realistic workload**, not a per-query miracle. The iron
|
|
105
|
+
law:
|
|
106
|
+
|
|
107
|
+
> On a *single hard query*, you cannot both beat the frontier model **and** be
|
|
108
|
+
> cheaper than it on that same query.
|
|
109
|
+
|
|
110
|
+
What you *can* do, and what this does:
|
|
111
|
+
|
|
112
|
+
| Traffic | What the router does | Outcome |
|
|
113
|
+
|---|---|---|
|
|
114
|
+
| **Easy queries** (most real traffic) | route to a cheap model | quality ties Opus, **5–50× cheaper** |
|
|
115
|
+
| **Hard queries** (the minority) | **Mixture-of-Agents**: several cheap/mid models answer in parallel, a synthesizer fuses them | quality can **match or exceed** a single Opus call, still **< Opus cost** |
|
|
116
|
+
| **Repeats** | exact-match cache | **free** |
|
|
117
|
+
|
|
118
|
+
Averaged over the workload, total spend is well below always-Opus and mean
|
|
119
|
+
accuracy is **equal-or-better**. Grounded in **RouteLLM**, **FrugalGPT** (cascade
|
|
120
|
+
with a judge), and **Mixture-of-Agents**.
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Modes
|
|
125
|
+
|
|
126
|
+
Pick the strategy via the `model` field:
|
|
127
|
+
|
|
128
|
+
| `model` | strategy |
|
|
129
|
+
|---|---|
|
|
130
|
+
| `router` / `router-balanced` | triage → single cheap (easy) / single mid (moderate) / Mixture-of-Agents (hard) |
|
|
131
|
+
| `router-cost` | **FrugalGPT cascade** — answer cheap, a judge scores it, escalate only if low |
|
|
132
|
+
| `router-quality` | bias one tier up — best quality while staying under Opus cost |
|
|
133
|
+
|
|
134
|
+
Any **real** model id (`claude-opus-4-8`, `gpt-5.5`, …) passes straight through, so
|
|
135
|
+
this also works as a plain multi-provider proxy.
|
|
136
|
+
|
|
137
|
+
## How it works
|
|
138
|
+
|
|
139
|
+
```
|
|
140
|
+
request ─► [cache] ─► [triage: how hard?] ─► [policy] ──► single cheap model (easy)
|
|
141
|
+
└─► single mid model (moderate)
|
|
142
|
+
└─► Mixture-of-Agents (hard)
|
|
143
|
+
proposers ∥ ─► synthesizer
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
- **Triage** (`src/switchboard/classify.py`) — free heuristics (length, code/math
|
|
147
|
+
markers, multi-step verbs) decide obvious cases; a tiny LLM classifier scores the
|
|
148
|
+
ambiguous middle. Output: difficulty 1–5 → tier.
|
|
149
|
+
- **Policy / execution** (`src/switchboard/engine.py`) — `single`, `moa` (parallel
|
|
150
|
+
proposers + synthesizer), or `cascade` (cheap → judge → escalate).
|
|
151
|
+
- **Cost accounting** — every response carries its internal cost, an estimate of
|
|
152
|
+
what always-Opus would have cost, and the savings %, under a `switchboard` key.
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Results
|
|
157
|
+
|
|
158
|
+
On **GSM8K (50 items, exact numeric grading)**, baseline = always `claude-opus-4-8`:
|
|
159
|
+
|
|
160
|
+
| config | accuracy | total cost | vs Opus |
|
|
161
|
+
|---|---|---|---|
|
|
162
|
+
| always-Opus | 100.0% | $0.3674 | baseline |
|
|
163
|
+
| `router-cost` | **100.0%** | $0.0064 | **57× cheaper — Pareto win** |
|
|
164
|
+
| `router-quality` | 100.0% | $0.2781 | 1.3× cheaper |
|
|
165
|
+
| `router-balanced` | 92.0% | $0.0611 | 6× cheaper but lost accuracy |
|
|
166
|
+
|
|
167
|
+
Reproduce: `python -m bench.run_gsm8k --n 50 --seed 0`. Full write-up and honest
|
|
168
|
+
caveats in [`RESULTS.md`](RESULTS.md). (The verifier is what makes routing safe —
|
|
169
|
+
`router-balanced` has none and lost 8 points; `router-cost`'s judge is the fix.)
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
## Limitations & next steps
|
|
174
|
+
|
|
175
|
+
- **Pricing is a list-price proxy** (`src/switchboard/config.py`). Drop your real
|
|
176
|
+
rate card into `pricing.json` (`{"model": [in_per_1M, out_per_1M]}`) to override.
|
|
177
|
+
- **Triage under-detects "deceptively simple" trap questions** — `router-cost`/
|
|
178
|
+
`router-quality` compensate via the judge/MoA.
|
|
179
|
+
- **Streaming is simulated** (full answer computed, then chunked) — MoA can't
|
|
180
|
+
token-stream; only the single-model path could truly stream.
|
|
181
|
+
- **Semantic cache** (embed prompt → nearest neighbour) is not yet wired.
|
|
182
|
+
- **The gateway's `/v1/models` list may be stale** — trust `switchboard models`.
|
|
183
|
+
|
|
184
|
+
## License
|
|
185
|
+
|
|
186
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
switchboard/__init__.py,sha256=fMa5XVm2tqaCzHbI2_R4Xyuj_OFOoD-qi-BgvVcTvk8,1761
|
|
2
|
+
switchboard/cache.py,sha256=5AOK4Vv_qsAwCJ8DSzsrO3kNW-W55DYnH8Q2NyEsVq8,1976
|
|
3
|
+
switchboard/classify.py,sha256=5iMqEFEXaOmmqtZXwh8OZcPkqFzz6sOXSGx58SGY3ac,4883
|
|
4
|
+
switchboard/cli.py,sha256=p7owaGcmueY9yJdnheeiFh93yxVK8gjy8g-zBGt9UFs,3526
|
|
5
|
+
switchboard/config.py,sha256=kqZ-odX7lMWldcp1nc9NAmulREfmqBInMrUB0COtPrY,4509
|
|
6
|
+
switchboard/engine.py,sha256=tmyyrhyMSOQKVcyVqk1XusI_suDEXutS5OTnRQpTo58,11208
|
|
7
|
+
switchboard/gateway.py,sha256=7EmH5drldQ0DSG_SInZdULUg1GF1NeBtuNPHL3WycHg,4442
|
|
8
|
+
switchboard/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
switchboard/server.py,sha256=C9PPSwomrphRpL5AXxgQNtr_CWbZ2I_Fhfq3YSk4Gwk,6210
|
|
10
|
+
switchboard_llm-0.1.0.dist-info/METADATA,sha256=nm2oqTFM27pFwf0MckcbuzDVvtxuHs4zUFkqrZ8qxpI,7307
|
|
11
|
+
switchboard_llm-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
12
|
+
switchboard_llm-0.1.0.dist-info/entry_points.txt,sha256=r2X0ZfWK4hPbr1jjFGW-ODJN8-unQVhix3OTC0O3XUA,53
|
|
13
|
+
switchboard_llm-0.1.0.dist-info/licenses/LICENSE,sha256=tys42fCgdQASPWxptC0DasjfZgUdVxXko8tGZ6K4ZOE,1071
|
|
14
|
+
switchboard_llm-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Archit Dwivedi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|