route67 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_router/__init__.py +8 -0
- llm_router/config.py +61 -0
- llm_router/controller.py +101 -0
- llm_router/embedder.py +42 -0
- llm_router/escalation.py +248 -0
- llm_router/logging_utils.py +34 -0
- llm_router/prompts.py +52 -0
- llm_router/routing_table.py +101 -0
- route67-0.1.0.dist-info/METADATA +220 -0
- route67-0.1.0.dist-info/RECORD +12 -0
- route67-0.1.0.dist-info/WHEEL +4 -0
- route67-0.1.0.dist-info/licenses/LICENSE +21 -0
llm_router/__init__.py
ADDED
llm_router/config.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Configuration models for the router."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
# Static type for the two logical routing targets an entry may name.
ModelTarget = Literal["weak_model", "strong_model"]
|
|
9
|
+
# Runtime set of the same two target names, used for membership validation.
MODEL_TARGETS = frozenset({"weak_model", "strong_model"})
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True, slots=True)
class RoutingTableEntry:
    """One example query mapped to the logical model that should handle it.

    Raises:
        ValueError: If ``target`` is not a member of ``MODEL_TARGETS``.
    """

    query: str
    target: ModelTarget
    notes: str | None = None

    def __post_init__(self) -> None:
        # ``Literal`` annotations are not enforced at runtime, so check here.
        if self.target in MODEL_TARGETS:
            return
        raise ValueError("routing target must be 'weak_model' or 'strong_model'")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass(frozen=True, slots=True)
|
|
26
|
+
class ModelSpec:
|
|
27
|
+
name: str
|
|
28
|
+
usage_notes: str | None = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass(slots=True)
|
|
32
|
+
class RouterConfig:
|
|
33
|
+
routing_table: list[RoutingTableEntry] = field(default_factory=list)
|
|
34
|
+
similarity_threshold: float = 0.75
|
|
35
|
+
weak_model: ModelSpec | None = None
|
|
36
|
+
strong_model: ModelSpec | None = None
|
|
37
|
+
embedding_cache_path: str | None = None
|
|
38
|
+
log_path: str | None = None
|
|
39
|
+
escalation_max_tokens: int = 10
|
|
40
|
+
embedding_model: str = "minishlab/potion-base-8M"
|
|
41
|
+
|
|
42
|
+
def __post_init__(self) -> None:
|
|
43
|
+
if not -1.0 <= self.similarity_threshold <= 1.0:
|
|
44
|
+
raise ValueError("similarity_threshold must be between -1.0 and 1.0")
|
|
45
|
+
if self.weak_model is None:
|
|
46
|
+
raise ValueError("weak_model is required")
|
|
47
|
+
if self.strong_model is None:
|
|
48
|
+
raise ValueError("strong_model is required")
|
|
49
|
+
if self.escalation_max_tokens < 1:
|
|
50
|
+
raise ValueError("escalation_max_tokens must be at least 1")
|
|
51
|
+
|
|
52
|
+
def resolve_target(self, target: ModelTarget) -> ModelSpec:
|
|
53
|
+
if target == "weak_model":
|
|
54
|
+
if self.weak_model is None:
|
|
55
|
+
raise RuntimeError("weak_model is not configured")
|
|
56
|
+
return self.weak_model
|
|
57
|
+
if target == "strong_model":
|
|
58
|
+
if self.strong_model is None:
|
|
59
|
+
raise RuntimeError("strong_model is not configured")
|
|
60
|
+
return self.strong_model
|
|
61
|
+
raise ValueError("routing target must be 'weak_model' or 'strong_model'")
|
llm_router/controller.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""OpenAI-compatible public controller."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from .config import RouterConfig
|
|
8
|
+
from .embedder import Embedder
|
|
9
|
+
from .escalation import run_with_escalation
|
|
10
|
+
from .logging_utils import RoutingDecision, log_decision
|
|
11
|
+
from .routing_table import RoutingTable
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class Controller:
    """Routing facade exposing an OpenAI-style ``chat.completions.create``.

    A request is first compared against the routing table. When the best
    entry scores at or above ``config.similarity_threshold`` the request is
    sent straight to that entry's model; otherwise it goes through
    ``run_with_escalation``.
    """

    def __init__(
        self,
        config: RouterConfig,
        openai_client: Any | None = None,
        embedder: Embedder | None = None,
    ) -> None:
        """Store the config and build the routing table.

        Args:
            config: Router settings.
            openai_client: Client used for every completion call; a default
                ``OpenAI()`` client is created when omitted.
            embedder: Embedder used by the routing table; one is created from
                ``config.embedding_model`` when omitted.
        """
        self.config = config
        # Compare against None instead of relying on truthiness, so an
        # injected object that happens to evaluate as falsy is still used.
        self.client = (
            _default_openai_client() if openai_client is None else openai_client
        )
        self.table = RoutingTable(
            config.routing_table,
            Embedder(config.embedding_model) if embedder is None else embedder,
            config.embedding_cache_path,
        )
        self.chat = _ChatProxy(self)

    def chat_completions_create(self, **kwargs: Any) -> Any:
        """Route one chat-completion request and return the model response.

        Args:
            **kwargs: Chat-completion arguments. ``model`` is ignored because
                the router picks the model; everything else is forwarded.

        Raises:
            NotImplementedError: If ``stream`` is truthy.
            TypeError: If ``messages`` is not a list.
        """
        if kwargs.get("stream"):
            raise NotImplementedError("Public streaming is not supported in route67 v1")
        messages = kwargs.get("messages")
        if not isinstance(messages, list):
            raise TypeError("messages must be provided as a list")

        query = extract_user_query(messages)
        entry, score = self.table.best_match(query)
        # The caller's model choice is replaced by the routing decision.
        forwarded = {key: value for key, value in kwargs.items() if key != "model"}

        if entry is not None and score >= self.config.similarity_threshold:
            selected_model = self.config.resolve_target(entry.target)
            response = self.client.chat.completions.create(
                model=selected_model.name,
                **forwarded,
            )
            decision = RoutingDecision("table_match", selected_model.name, score)
        else:
            result = run_with_escalation(
                self.client,
                self.config,
                messages,
                request_kwargs=forwarded,
            )
            response = result.response
            decision = RoutingDecision(
                "escalated" if result.escalated else "weak_model_direct",
                result.used_model,
                score,
            )

        log_decision(self.config.log_path, query, decision)
        return response
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def extract_user_query(messages: list[dict[str, Any]]) -> str:
    """Return the text of the most recent user message.

    Messages may be plain dicts or objects exposing ``role`` and ``content``
    attributes. List-style content is reduced to its ``text`` parts joined by
    single spaces; missing or ``None`` content yields an empty string.

    Args:
        messages: Conversation in chat-completions message format.

    Returns:
        The text of the last message whose role is ``"user"``.

    Raises:
        ValueError: If no message has the role ``"user"``.
    """

    def field_of(item: Any, name: str) -> Any:
        # Dict access for plain messages, attribute access for message objects.
        if isinstance(item, dict):
            return item.get(name)
        return getattr(item, name, None)

    for message in reversed(messages):
        if field_of(message, "role") != "user":
            continue
        content = field_of(message, "content")
        if content is None:
            # Avoid turning a missing body into the literal string "None".
            return ""
        if isinstance(content, str):
            return content
        if isinstance(content, list):
            return " ".join(
                field_of(part, "text") or ""
                for part in content
                if field_of(part, "type") == "text"
            )
        return str(content)
    raise ValueError("messages must contain at least one user message")
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _default_openai_client() -> Any:
    """Create an ``OpenAI()`` client, importing the SDK only when needed.

    Raises:
        ImportError: If the ``openai`` package cannot be imported.
    """
    try:
        from openai import OpenAI
    except ImportError as exc:
        message = "openai is required when openai_client is not supplied"
        raise ImportError(message) from exc
    else:
        return OpenAI()
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class _ChatProxy:
    """Provides the ``.completions`` attribute of the ``chat`` namespace."""

    def __init__(self, controller: Controller) -> None:
        completions = _CompletionsProxy(controller)
        self.completions = completions
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class _CompletionsProxy:
    """Forwards ``create`` calls to the owning controller."""

    def __init__(self, controller: Controller) -> None:
        self._controller = controller

    def create(self, **kwargs: Any) -> Any:
        """Delegate to ``Controller.chat_completions_create`` unchanged."""
        handler = self._controller.chat_completions_create
        return handler(**kwargs)
|
llm_router/embedder.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Small, lazy-loading Model2Vec embedder."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Sequence
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class Embedder:
    """Text embedder that defers loading the Model2Vec model until first use."""

    def __init__(
        self,
        model_name: str = "minishlab/potion-base-8M",
        model: Any | None = None,
    ) -> None:
        """Remember the model name and an optional pre-built model.

        Args:
            model_name: Name passed to ``StaticModel.from_pretrained`` when
                the model has to be loaded.
            model: Ready-made object with an ``encode`` method; when given,
                nothing is loaded.
        """
        self.model_name = model_name
        self._model = model

    @property
    def model(self) -> Any:
        """Return the underlying model, loading it on first access.

        Raises:
            ImportError: If ``model2vec`` cannot be imported.
        """
        if self._model is not None:
            return self._model
        try:
            from model2vec import StaticModel
        except ImportError as exc:
            raise ImportError(
                "model2vec is required to compute embeddings; install route67"
            ) from exc
        self._model = StaticModel.from_pretrained(self.model_name)
        return self._model

    def encode(self, texts: Sequence[str]) -> np.ndarray:
        """Embed several texts and return a 2-D ``float32`` array.

        Raises:
            TypeError: If ``texts`` is a single string.
            ValueError: If the model output is not two-dimensional.
        """
        # A bare string is itself a sequence of characters; reject it so it
        # is not silently embedded character by character.
        if isinstance(texts, str):
            raise TypeError("encode expects a sequence of strings; use encode_one for one string")
        raw = self.model.encode(list(texts))
        vectors = np.asarray(raw, dtype=np.float32)
        if vectors.ndim == 2:
            return vectors
        raise ValueError("embedder returned an array with an unexpected shape")

    def encode_one(self, text: str) -> np.ndarray:
        """Embed a single text and return its vector."""
        batch = self.encode([text])
        return batch[0]
|
|
42
|
+
|
llm_router/escalation.py
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"""Sequential weak-to-strong escalation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from types import SimpleNamespace
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from .config import RouterConfig
|
|
11
|
+
from .prompts import build_escalation_prompt
|
|
12
|
+
|
|
13
|
+
# Marker text that signals the weak model is handing the request off.
SENTINEL = "ESCALATE"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True, slots=True)
|
|
17
|
+
class EscalationResult:
|
|
18
|
+
used_model: str
|
|
19
|
+
response: Any
|
|
20
|
+
escalated: bool
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def run_with_escalation(
    client: Any,
    config: RouterConfig,
    messages: list[dict[str, Any]],
    request_kwargs: dict[str, Any] | None = None,
) -> EscalationResult:
    """Ask the weak model first and fall back to the strong model on request.

    The weak model is streamed with an extra system prompt telling it to
    reply with the sentinel when it cannot answer. Its output is inspected
    as it arrives; as soon as an escalation is detected the stream is
    abandoned and the original messages are sent to the strong model.
    Otherwise the streamed chunks are assembled into one completion.

    Args:
        client: OpenAI-compatible client.
        config: Router settings providing both models and the routing table.
        messages: Original conversation, without the escalation prompt.
        request_kwargs: Extra arguments forwarded to both completion calls;
            ``model``, ``messages`` and ``stream`` are ignored.

    Returns:
        An ``EscalationResult`` naming the model that produced the response.
    """
    # Work on a copy so the caller's dict is left untouched, and drop the
    # keys this function always sets itself.
    request_kwargs = dict(request_kwargs or {})
    for reserved in ("model", "messages", "stream"):
        request_kwargs.pop(reserved, None)

    prompt = build_escalation_prompt(
        config.weak_model.usage_notes,
        config.strong_model,
        config.routing_table,
    )
    weak_messages = [{"role": "system", "content": prompt}, *messages]
    weak_stream = client.chat.completions.create(
        model=config.weak_model.name,
        messages=weak_messages,
        stream=True,
        **request_kwargs,
    )

    chunks: list[Any] = []
    preview = ""
    decision_made = False
    try:
        for chunk in weak_stream:
            chunks.append(chunk)
            preview += _chunk_text(chunk)
            if decision_made:
                continue
            if _decision_boundary_reached(preview, config.escalation_max_tokens):
                decision_made = True
                if _is_escalation(preview):
                    # Stop reading; the rest of the weak output is not needed.
                    break
    finally:
        # Single close point, reached before any strong-model request is made.
        _close_stream(weak_stream)

    # Covers the early break above and a sentinel that only became complete
    # after the boundary check had already fired.
    if _is_escalation(preview):
        strong_response = client.chat.completions.create(
            model=config.strong_model.name,
            messages=messages,
            stream=False,
            **request_kwargs,
        )
        return EscalationResult(
            used_model=config.strong_model.name,
            response=strong_response,
            escalated=True,
        )

    response = _assemble_chat_completion(chunks, config.weak_model.name)
    return EscalationResult(config.weak_model.name, response, False)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _decision_boundary_reached(text: str, max_tokens: int) -> bool:
    """Tell whether enough output has arrived to decide on escalation.

    The boundary is reached when the text (ignoring leading whitespace)
    contains a line break, already starts with the sentinel, can no longer
    become the sentinel, or holds at least ``max_tokens`` whitespace-separated
    words.
    """
    stripped = text.lstrip()
    lowered = stripped.lower()
    sentinel = SENTINEL.lower()

    if any(line_break in stripped for line_break in ("\n", "\r")):
        return True
    starts_with_sentinel = lowered.startswith(sentinel) and len(stripped) >= len(SENTINEL)
    # Non-empty text that is not a prefix of the sentinel has diverged from it.
    diverged = bool(stripped) and not sentinel.startswith(lowered)
    if starts_with_sentinel or diverged:
        return True
    word_count = len(stripped.split())
    return word_count >= max_tokens
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _is_escalation(text: str) -> bool:
    """Return True when ``text`` begins with the sentinel.

    Leading whitespace and letter case are ignored.
    """
    candidate = text.lstrip().lower()
    return candidate.startswith(SENTINEL.lower())
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _chunk_text(chunk: Any) -> str:
    """Return the content text of the first choice's delta, or ``""``."""
    choices = _get(chunk, "choices", []) or []
    if choices:
        delta = _get(choices[0], "delta")
        text = _get(delta, "content", "")
        return text or ""
    return ""
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _close_stream(stream: Any) -> None:
    """Call ``stream.close()`` when the object has a callable ``close``."""
    closer = getattr(stream, "close", None)
    if not callable(closer):
        return
    closer()
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _assemble_chat_completion(chunks: list[Any], model: str) -> Any:
    """Fold streamed chunks into one non-streaming chat completion.

    Only the first choice of each chunk is used. Deltas are merged into a
    single assistant message, and keys outside the standard response and
    choice fields are carried over with their last non-``None`` value.

    Args:
        chunks: Streamed chunks, as dicts or objects.
        model: Model name written into the assembled response.

    Returns:
        A validated ``openai.types.chat.ChatCompletion`` when possible,
        otherwise a ``SimpleNamespace`` tree with the same shape.
    """
    standard_response_keys = {
        "id",
        "object",
        "created",
        "model",
        "choices",
        "usage",
        "system_fingerprint",
    }
    standard_choice_keys = {"index", "delta", "finish_reason"}

    message: dict[str, Any] = {"role": "assistant", "content": ""}
    response_extensions: dict[str, Any] = {}
    choice_extensions: dict[str, Any] = {}
    finish_reason = "stop"
    completion_id = "route67-weak"
    created = int(time.time())
    system_fingerprint = None
    usage = None

    for chunk in chunks:
        chunk_payload = _dump(chunk)
        if isinstance(chunk_payload, dict):
            for key, value in chunk_payload.items():
                if key not in standard_response_keys and value is not None:
                    response_extensions[key] = value
        completion_id = _get(chunk, "id", completion_id) or completion_id
        created = _get(chunk, "created", created) or created

        # Chunk objects can expose these attributes as None on most chunks;
        # only a real value may replace what was already captured.
        chunk_fingerprint = _get(chunk, "system_fingerprint")
        if chunk_fingerprint is not None:
            system_fingerprint = chunk_fingerprint
        chunk_usage = _get(chunk, "usage")
        if chunk_usage is not None:
            usage = chunk_usage

        choices = _get(chunk, "choices", []) or []
        if not choices:
            continue
        choice = choices[0]
        delta_payload = _dump(_get(choice, "delta"))
        if isinstance(delta_payload, dict):
            message = _merge_stream_value(message, delta_payload)
        choice_payload = _dump(choice)
        if isinstance(choice_payload, dict):
            for key, value in choice_payload.items():
                if key not in standard_choice_keys and value is not None:
                    choice_extensions[key] = value
        finish_reason = _get(choice, "finish_reason", finish_reason) or finish_reason

    payload = {
        "id": completion_id,
        "object": "chat.completion",
        "created": created,
        "model": model,
        "choices": [
            {
                "index": 0,
                "message": message,
                "finish_reason": finish_reason,
                "logprobs": None,
                **choice_extensions,
            }
        ],
        "usage": _dump(usage) if usage is not None else None,
        "system_fingerprint": system_fingerprint,
        **response_extensions,
    }
    try:
        from openai.types.chat import ChatCompletion

        return ChatCompletion.model_validate(payload)
    except (ImportError, TypeError, ValueError):
        # Compatible providers can add values before the OpenAI SDK schema knows
        # about them. Preserve the response shape instead of rejecting the data.
        return _namespace(payload)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def _get(value: Any, name: str, default: Any = None) -> Any:
    """Read ``name`` from a dict by key or from any other object by attribute."""
    if not isinstance(value, dict):
        return getattr(value, name, default)
    return value.get(name, default)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _dump(value: Any) -> Any:
    """Return a dict view of ``value`` when one can be obtained.

    Dicts are returned as-is, objects with ``model_dump`` are dumped without
    ``None`` fields, objects with a ``__dict__`` yield a shallow copy of it,
    and anything else is returned unchanged.
    """
    if isinstance(value, dict):
        return value
    dumper = getattr(value, "model_dump", None)
    if callable(dumper):
        return dumper(exclude_none=True)
    attributes = getattr(value, "__dict__", None)
    if isinstance(attributes, dict):
        return dict(attributes)
    return value
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _merge_stream_value(current: Any, incoming: Any, key: str | None = None) -> Any:
    """Merge streamed OpenAI-format deltas without dropping provider extensions.

    Dicts merge key by key, lists merge through ``_merge_stream_lists``, and
    strings are concatenated as incremental text. Identifier-like string
    fields are the exception: they are whole values that some providers
    repeat on every chunk, so the latest non-empty value wins.

    Args:
        current: Value accumulated so far, or ``None``.
        incoming: Value from the new delta, or ``None``.
        key: Name of the field being merged, when known.

    Returns:
        The merged value; ``current`` and ``incoming`` are not mutated.
    """
    atomic_keys = {"role", "name", "id", "type", "format"}

    if incoming is None:
        return current
    if current is None:
        return incoming
    if isinstance(current, dict) and isinstance(incoming, dict):
        merged = dict(current)
        for child_key, value in incoming.items():
            merged[child_key] = _merge_stream_value(
                merged.get(child_key), value, child_key
            )
        return merged
    if isinstance(current, list) and isinstance(incoming, list):
        return _merge_stream_lists(current, incoming)
    if isinstance(current, str) and isinstance(incoming, str):
        if key in atomic_keys:
            # An empty repeat must not erase a value that is already set.
            return incoming or current
        return current + incoming
    return incoming
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _merge_stream_lists(current: list[Any], incoming: list[Any]) -> list[Any]:
    """Merge two streamed lists, pairing dict items by their ``index`` field.

    An incoming dict whose ``index`` matches an existing item is merged into
    that item; every other incoming item is appended. The inputs are left
    unmodified.
    """
    merged = list(current)

    # Map each known ``index`` value to its position in ``merged``.
    positions = {}
    for position, item in enumerate(merged):
        if isinstance(item, dict) and item.get("index") is not None:
            positions[item.get("index")] = position

    for item in incoming:
        indexed = isinstance(item, dict) and item.get("index") is not None
        if indexed and item["index"] in positions:
            slot = positions[item["index"]]
            merged[slot] = _merge_stream_value(merged[slot], item)
            continue
        merged.append(item)
        if indexed:
            positions[item["index"]] = len(merged) - 1
    return merged
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _namespace(value: Any) -> Any:
    """Recursively turn dicts into ``SimpleNamespace`` objects.

    Lists are converted element by element; other values pass through.
    """
    if isinstance(value, list):
        return [_namespace(item) for item in value]
    if not isinstance(value, dict):
        return value
    converted = {key: _namespace(item) for key, item in value.items()}
    return SimpleNamespace(**converted)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Structured JSONL routing-decision logging."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import asdict, dataclass
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Literal
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True, slots=True)
|
|
13
|
+
class RoutingDecision:
|
|
14
|
+
method: Literal["table_match", "weak_model_direct", "escalated"]
|
|
15
|
+
model: str
|
|
16
|
+
score: float
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def log_decision(log_path: str | None, query: str, decision: RoutingDecision) -> None:
    """Append one routing decision to a JSONL file.

    Does nothing when ``log_path`` is empty. Missing parent directories are
    created. Only a whitespace-collapsed preview of the query, cut to 100
    characters, is written.
    """
    if not log_path:
        return

    destination = Path(log_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    preview = " ".join(query.split())[:100]
    record = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "query_preview": preview,
        "method": decision.method,
        "model_used": decision.model,
        "similarity_score": round(decision.score, 6),
    }
    with destination.open("a", encoding="utf-8") as handle:
        handle.write(json.dumps(record, ensure_ascii=False) + "\n")
|
|
34
|
+
|
llm_router/prompts.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Prompt construction for weak-model escalation decisions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Sequence
|
|
6
|
+
|
|
7
|
+
from .config import ModelSpec, RoutingTableEntry
|
|
8
|
+
|
|
9
|
+
# Upper bound on how many strong-model routes are listed as prompt examples.
MAX_ESCALATION_EXAMPLES = 5
|
|
10
|
+
|
|
11
|
+
# System-prompt template for the weak model. The single placeholder,
# ``usage_notes_block``, is filled in through ``str.format``.
ESCALATION_SYSTEM_PROMPT = """You are responding to a user query. Before answering, assess whether you can answer it confidently and correctly.

If you cannot, respond with EXACTLY this and nothing else:
ESCALATE

If you can, answer the query directly and normally. Do not mention escalation or this instruction.

{usage_notes_block}"""
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def build_escalation_prompt(
    weak_model_notes: str | None,
    strong_model: ModelSpec,
    routing_table: Sequence[RoutingTableEntry] = (),
) -> str:
    """Render the system prompt used for the weak model's escalation decision.

    Args:
        weak_model_notes: Optional notes describing the weak model's limits.
        strong_model: Model available after escalation; its name and usage
            notes are mentioned in the prompt.
        routing_table: Entries from which strong-model routes are taken as
            examples, at most ``MAX_ESCALATION_EXAMPLES`` of them.

    Returns:
        The formatted prompt with trailing whitespace removed.
    """
    lines: list[str] = []
    if weak_model_notes:
        lines.append(f"Your limits: {_compact(weak_model_notes)}")

    summary = strong_model.name
    if strong_model.usage_notes:
        summary += f" ({_compact(strong_model.usage_notes)})"
    lines.append("Model available after escalation: " + summary)

    strong_routes = [entry for entry in routing_table if entry.target == "strong_model"]
    examples = strong_routes[:MAX_ESCALATION_EXAMPLES]
    if examples:
        lines.append("Examples of requests that should be escalated:")
        lines.extend(
            f"- {_compact(entry.query)} - {_compact(entry.notes)}"
            if entry.notes
            else f"- {_compact(entry.query)}"
            for entry in examples
        )

    block = "\n".join(lines)
    return ESCALATION_SYSTEM_PROMPT.format(usage_notes_block=block).rstrip()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _compact(value: str) -> str:
    """Collapse every whitespace run to one space and trim both ends."""
    words = value.split()
    return " ".join(words)
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""In-memory semantic routing table with semantic search and an optional disk cache."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
from dataclasses import asdict
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
from .config import RoutingTableEntry
|
|
13
|
+
from .embedder import Embedder
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RoutingTable:
    """Routing entries with row-normalised embeddings for similarity search.

    Embeddings are computed when the table is built and, when a cache path is
    given, stored as a ``.npy`` file next to a ``.json`` metadata file. The
    cache is reused only while the table hash and embedding model name match.
    """

    def __init__(
        self,
        entries: list[RoutingTableEntry],
        embedder: Embedder,
        cache_path: str | None = None,
    ) -> None:
        """Copy the entries and load or compute their embeddings.

        Args:
            entries: Routing entries; the list is copied.
            embedder: Object providing ``encode`` and ``model_name``.
            cache_path: Optional base path for the on-disk cache.
        """
        self.entries = list(entries)
        self.embedder = embedder
        self.cache_path = cache_path
        self.embeddings = np.empty((0, 0), dtype=np.float32)
        self.load_or_build()

    def load_or_build(self) -> None:
        """Fill ``self.embeddings`` from the cache or by embedding the queries.

        A missing, stale or unreadable cache is never fatal: the embeddings
        are recomputed and the cache files are rewritten.
        """
        if not self.entries:
            return

        table_hash = self._table_hash()
        vector_path, metadata_path = self._cache_paths()
        if vector_path and metadata_path and vector_path.exists() and metadata_path.exists():
            try:
                metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
                if (
                    # Valid JSON that is not an object has no ``.get``.
                    isinstance(metadata, dict)
                    and metadata.get("table_hash") == table_hash
                    and metadata.get("embedding_model") == self.embedder.model_name
                ):
                    cached = np.load(vector_path, allow_pickle=False)
                    if cached.ndim == 2 and cached.shape[0] == len(self.entries):
                        self.embeddings = cached.astype(np.float32, copy=False)
                        return
            except (OSError, ValueError, EOFError, json.JSONDecodeError):
                # Deliberately ignored: a damaged cache (EOFError covers an
                # empty or truncated vector file) falls through to a rebuild.
                pass

        vectors = self.embedder.encode([entry.query for entry in self.entries])
        self.embeddings = _normalize_rows(vectors)

        if vector_path and metadata_path:
            vector_path.parent.mkdir(parents=True, exist_ok=True)
            np.save(vector_path, self.embeddings, allow_pickle=False)
            metadata_path.write_text(
                json.dumps(
                    {
                        "table_hash": table_hash,
                        "embedding_model": self.embedder.model_name,
                        "entries": [asdict(entry) for entry in self.entries],
                    },
                    ensure_ascii=False,
                    indent=2,
                ),
                encoding="utf-8",
            )

    def best_match(self, query: str) -> tuple[RoutingTableEntry | None, float]:
        """Return the entry most similar to ``query`` together with its score.

        The score is the dot product of row-normalised vectors. An empty
        table returns ``(None, 0.0)``.
        """
        if not self.entries:
            return None, 0.0

        query_vector = _normalize_rows(self.embedder.encode([query]))[0]
        scores = self.embeddings @ query_vector
        best_index = int(np.argmax(scores))
        return self.entries[best_index], float(scores[best_index])

    def _table_hash(self) -> str:
        """Return a SHA-256 hex digest of the entries as canonical JSON."""
        payload = json.dumps(
            [asdict(entry) for entry in self.entries],
            sort_keys=True,
            ensure_ascii=False,
            separators=(",", ":"),
        )
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

    def _cache_paths(self) -> tuple[Path | None, Path | None]:
        """Return the ``(.npy, .json)`` cache paths, or ``(None, None)``."""
        if not self.cache_path:
            return None, None
        base = Path(self.cache_path)
        # Accept either cache file name as the configured path.
        if base.suffix in {".npy", ".json"}:
            base = base.with_suffix("")
        return base.with_suffix(".npy"), base.with_suffix(".json")
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _normalize_rows(vectors: np.ndarray) -> np.ndarray:
    """Scale each row of a 2-D array to unit length as ``float32``.

    Rows whose norm is zero are returned as all zeros.

    Raises:
        ValueError: If ``vectors`` is not two-dimensional.
    """
    matrix = np.asarray(vectors, dtype=np.float32)
    if matrix.ndim != 2:
        raise ValueError("embeddings must be a two-dimensional array")
    lengths = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Divide zero-length rows by one instead, which leaves them at zero.
    divisors = np.where(lengths == 0, np.float32(1.0), lengths)
    return matrix / divisors
|
|
101
|
+
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: route67
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A semantic LLM router for OpenAI-compatible chat completions.
|
|
5
|
+
Project-URL: Homepage, https://github.com/SmallChungus1/route67
|
|
6
|
+
Project-URL: Repository, https://github.com/SmallChungus1/route67
|
|
7
|
+
Project-URL: Issues, https://github.com/SmallChungus1/route67/issues
|
|
8
|
+
Author: route67 contributors
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: llm,openai,router,semantic-routing
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Requires-Dist: model2vec<1,>=0.6
|
|
24
|
+
Requires-Dist: numpy>=1.24
|
|
25
|
+
Requires-Dist: openai<3,>=1.0
|
|
26
|
+
Provides-Extra: test
|
|
27
|
+
Requires-Dist: pytest>=8; extra == 'test'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# route67
|
|
31
|
+
|
|
32
|
+
`route67` is an LLM router for the OpenAI-compatible chat
|
|
33
|
+
completions format. It matches each request against a user-defined routing table by semantic similarity and sends it to the configured model; when no entry matches, a weak model either answers directly or explicitly escalates to a strong model.
|
|
34
|
+
|
|
35
|
+
## How it works
|
|
36
|
+
|
|
37
|
+
```mermaid
|
|
38
|
+
flowchart LR
|
|
39
|
+
Q["User request"] --> R{"Semantic route match?"}
|
|
40
|
+
R -- Yes --> M["Configured weak or strong model"]
|
|
41
|
+
R -- No --> W["Weak model gate<br/>usage notes + strong-route examples"]
|
|
42
|
+
W -- Answers --> O["Response"]
|
|
43
|
+
W -- ESCALATE --> S["Strong model"]
|
|
44
|
+
M --> O
|
|
45
|
+
S --> O
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Install
|
|
49
|
+
|
|
50
|
+
route67 requires Python 3.10 or newer. Choose either the standard Python workflow
|
|
51
|
+
or the `uv` workflow.
|
|
52
|
+
|
|
53
|
+
### Using `python -m venv`
|
|
54
|
+
|
|
55
|
+
Create and activate a virtual environment:
|
|
56
|
+
|
|
57
|
+
```console
|
|
58
|
+
python -m venv .venv
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
```powershell
|
|
62
|
+
# Windows PowerShell
|
|
63
|
+
.\.venv\Scripts\Activate.ps1
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
```console
|
|
67
|
+
# macOS/Linux
|
|
68
|
+
source .venv/bin/activate
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Then install route67 and its dependencies:
|
|
72
|
+
|
|
73
|
+
```console
|
|
74
|
+
python -m pip install --upgrade pip
|
|
75
|
+
python -m pip install -e .
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
To also install the test dependencies, use `python -m pip install -e ".[test]"`.
|
|
79
|
+
|
|
80
|
+
### Using `uv`
|
|
81
|
+
|
|
82
|
+
With [`uv`](https://docs.astral.sh/uv/) installed, create the environment and
|
|
83
|
+
install the project from the lockfile:
|
|
84
|
+
|
|
85
|
+
```console
|
|
86
|
+
uv sync
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Run commands inside the environment with `uv run`, for example
|
|
90
|
+
`uv run python example.py`. To include test dependencies, use
|
|
91
|
+
`uv sync --extra test`.
|
|
92
|
+
|
|
93
|
+
## Get started
|
|
94
|
+
|
|
95
|
+
Set an OpenAI API key in your environment:
|
|
96
|
+
|
|
97
|
+
```powershell
|
|
98
|
+
# Windows PowerShell
|
|
99
|
+
$env:OPENAI_API_KEY = "your-api-key"
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
```console
|
|
103
|
+
# macOS/Linux
|
|
104
|
+
export OPENAI_API_KEY="your-api-key"
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Create `example.py`:
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
from llm_router import Controller, ModelSpec, RouterConfig, RoutingTableEntry
|
|
111
|
+
|
|
112
|
+
config = RouterConfig(
|
|
113
|
+
routing_table=[
|
|
114
|
+
RoutingTableEntry(
|
|
115
|
+
"Prove this theorem",
|
|
116
|
+
"strong_model",
|
|
117
|
+
notes="Requires a rigorous multi-step proof.",
|
|
118
|
+
),
|
|
119
|
+
RoutingTableEntry("Rewrite this paragraph", "weak_model"),
|
|
120
|
+
],
|
|
121
|
+
weak_model=ModelSpec(
|
|
122
|
+
"gpt-5-mini",
|
|
123
|
+
usage_notes="Avoid difficult multi-step proofs.",
|
|
124
|
+
),
|
|
125
|
+
strong_model=ModelSpec(
|
|
126
|
+
"gpt-5",
|
|
127
|
+
usage_notes="Use for rigorous proofs and difficult reasoning.",
|
|
128
|
+
),
|
|
129
|
+
embedding_cache_path=".cache/routes",
|
|
130
|
+
log_path=".cache/routing.jsonl",
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
client = Controller(config)
|
|
134
|
+
response = client.chat.completions.create(
|
|
135
|
+
messages=[{"role": "user", "content": "Prove that sqrt(2) is irrational."}]
|
|
136
|
+
)
|
|
137
|
+
print(response.choices[0].message.content)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Run it with the activated standard virtual environment:
|
|
141
|
+
|
|
142
|
+
```console
|
|
143
|
+
python example.py
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Or with `uv`:
|
|
147
|
+
|
|
148
|
+
```console
|
|
149
|
+
uv run python example.py
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
### OpenAI-compatible providers
|
|
153
|
+
|
|
154
|
+
route67 can use any provider exposed through an OpenAI-compatible client. Create
|
|
155
|
+
the provider's client normally and inject it into the controller. Model names in
|
|
156
|
+
the routing configuration are passed to that provider unchanged.
|
|
157
|
+
|
|
158
|
+
For example, with OpenRouter:
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
import os
|
|
162
|
+
|
|
163
|
+
from openai import OpenAI
|
|
164
|
+
from llm_router import Controller, ModelSpec, RouterConfig, RoutingTableEntry
|
|
165
|
+
|
|
166
|
+
openrouter = OpenAI(
|
|
167
|
+
base_url="https://openrouter.ai/api/v1",
|
|
168
|
+
api_key=os.environ["OPENROUTER_API_KEY"],
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
config = RouterConfig(
|
|
172
|
+
routing_table=[
|
|
173
|
+
RoutingTableEntry(
|
|
174
|
+
"Answer questions about a country",
|
|
175
|
+
"weak_model",
|
|
176
|
+
),
|
|
177
|
+
RoutingTableEntry(
|
|
178
|
+
"Solve a difficult reasoning or math problem",
|
|
179
|
+
"strong_model",
|
|
180
|
+
notes="Requires careful multi-step reasoning.",
|
|
181
|
+
),
|
|
182
|
+
],
|
|
183
|
+
weak_model=ModelSpec(
|
|
184
|
+
"openai/gpt-4.1-mini",
|
|
185
|
+
usage_notes="Best for straightforward factual and writing questions.",
|
|
186
|
+
),
|
|
187
|
+
strong_model=ModelSpec(
|
|
188
|
+
"deepseek/deepseek-v4-flash",
|
|
189
|
+
usage_notes="Use for difficult reasoning, mathematics, and verification.",
|
|
190
|
+
),
|
|
191
|
+
)
|
|
192
|
+
|
|
193
|
+
client = Controller(config, openai_client=openrouter)
|
|
194
|
+
response = client.chat.completions.create(
|
|
195
|
+
messages=[
|
|
196
|
+
{
|
|
197
|
+
"role": "user",
|
|
198
|
+
"content": "How many r's are in the word 'strawberry'?",
|
|
199
|
+
}
|
|
200
|
+
],
|
|
201
|
+
extra_body={"reasoning": {"enabled": True}},
|
|
202
|
+
)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
Provider-specific request options such as `extra_body` and `extra_headers` are
|
|
206
|
+
forwarded unchanged. Provider-specific response fields, including
|
|
207
|
+
`reasoning_details`, are also preserved. To continue a provider's reasoning,
|
|
208
|
+
pass its assistant message fields back unmodified in the next request.
|
|
209
|
+
|
|
210
|
+
Routing table entries target only `"weak_model"` or `"strong_model"`. Provider
|
|
211
|
+
model names live in `ModelSpec`, so switching models or providers does not
|
|
212
|
+
require rewriting the routing table.
|
|
213
|
+
|
|
214
|
+
`ModelSpec.usage_notes` are added to the weak model's escalation system prompt.
|
|
215
|
+
The prompt also includes up to five routing-table entries targeting
|
|
216
|
+
`"strong_model"` as examples of requests that should be escalated. Add concise
|
|
217
|
+
`notes` to those entries when the reason for escalation is useful context.
|
|
218
|
+
|
|
219
|
+
Your first request will download the `minishlab/potion-base-8M` model from Hugging Face. The model is lazy-loaded,
|
|
220
|
+
so constructing a controller with an empty routing table does not download it.
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
llm_router/__init__.py,sha256=RwHWX6F0Is8P0MaQTNj7wLZDurk4vHtpZ7L3W6kC6dU,242
|
|
2
|
+
llm_router/config.py,sha256=FxjEeO62qVwVcPtxsvfsm82ezo91A2-A9zHIq76oLd4,2086
|
|
3
|
+
llm_router/controller.py,sha256=5rSAP2pTzFWApZp0_oB7y7RWGTpKkuiYCgC7gkWjoCs,3390
|
|
4
|
+
llm_router/embedder.py,sha256=9g1V7toS7c8H7Jvmx5ho0kZsnzlPXuyYV5xo1Ao4qKo,1305
|
|
5
|
+
llm_router/escalation.py,sha256=S22Lt3bUei3QjOKd1470Hpn6CujAiu46UcOI6K_-m6c,8445
|
|
6
|
+
llm_router/logging_utils.py,sha256=G9d-aEFGYlaKDYNyiuklMwAu8MUyOXFvDOCAzNKekYk,984
|
|
7
|
+
llm_router/prompts.py,sha256=tC01-zazA0wcclb8eAS0I7JPMCm0WZs55jIHyKl8QG4,1643
|
|
8
|
+
llm_router/routing_table.py,sha256=x5X0sGMn-oOaPncTXswqCGIJFd0aGiNRvTHXVnjkxeA,3667
|
|
9
|
+
route67-0.1.0.dist-info/METADATA,sha256=27TSDzfH3YjRE8e7lQxZ1uCLZWikph3Op52K2vxM_10,6343
|
|
10
|
+
route67-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
+
route67-0.1.0.dist-info/licenses/LICENSE,sha256=QaM-505zGS0RtXBxfg_YtfN-J3YndVxG7ruuzAGs2v4,1077
|
|
12
|
+
route67-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 route67 contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|