claude-model-router 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- claude_model_router-0.2.0/LICENSE +21 -0
- claude_model_router-0.2.0/PKG-INFO +201 -0
- claude_model_router-0.2.0/README.md +174 -0
- claude_model_router-0.2.0/claude_model_router.egg-info/PKG-INFO +201 -0
- claude_model_router-0.2.0/claude_model_router.egg-info/SOURCES.txt +13 -0
- claude_model_router-0.2.0/claude_model_router.egg-info/dependency_links.txt +1 -0
- claude_model_router-0.2.0/claude_model_router.egg-info/requires.txt +4 -0
- claude_model_router-0.2.0/claude_model_router.egg-info/top_level.txt +1 -0
- claude_model_router-0.2.0/model_router/__init__.py +25 -0
- claude_model_router-0.2.0/model_router/classifier.py +123 -0
- claude_model_router-0.2.0/model_router/router.py +97 -0
- claude_model_router-0.2.0/pyproject.toml +39 -0
- claude_model_router-0.2.0/setup.cfg +4 -0
- claude_model_router-0.2.0/tests/test_classifier.py +94 -0
- claude_model_router-0.2.0/tests/test_router.py +127 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Juno Seong
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: claude-model-router
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Cost-aware model routing for the Claude API — reference implementation with a Claude Code subagent hook
|
|
5
|
+
Author: Juno Seong
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/junoseong/claude-model-router
|
|
8
|
+
Project-URL: Repository, https://github.com/junoseong/claude-model-router
|
|
9
|
+
Project-URL: Issues, https://github.com/junoseong/claude-model-router/issues
|
|
10
|
+
Keywords: claude,anthropic,llm,model-routing,cost-optimization
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: anthropic>=0.115
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# claude-model-router
|
|
29
|
+
|
|
30
|
+
Reference implementation of cost-aware model routing for the Claude API —
|
|
31
|
+
plus a Claude Code hook that stops subagents from silently inheriting your
|
|
32
|
+
most expensive model.
|
|
33
|
+
|
|
34
|
+
A cheap Haiku 4.5 call classifies each prompt's complexity; the prompt then
|
|
35
|
+
executes on the cheapest capable tier. Small, tested, and built around the
|
|
36
|
+
parts naive routers get wrong.
|
|
37
|
+
|
|
38
|
+
## Why naive Claude routers break
|
|
39
|
+
|
|
40
|
+
Most routing examples treat model IDs as interchangeable strings. On the
|
|
41
|
+
Claude 5 family they are not:
|
|
42
|
+
|
|
43
|
+
| Trap | What actually happens | Where handled |
|
|
44
|
+
|---|---|---|
|
|
45
|
+
| Refusals are not exceptions | Fable 5 refusals return **HTTP 200** with `stop_reason: "refusal"` — a try/except fallback never fires | server-side fallback beta + explicit `stop_reason` check ([router.py](model_router/router.py)) |
|
|
46
|
+
| `thinking` differs per model | Fable 5: always on, explicit config → 400. Opus 4.8: omitted = **off**. Sonnet 5: omitted = adaptive. Haiku 4.5: adaptive → 400 | per-model request shape in `build_request()` |
|
|
47
|
+
| `effort` is not universal | `output_config: {"effort": ...}` → 400 on Haiku 4.5 | effort omitted on the trivial tier |
|
|
48
|
+
| `content[0].text` crashes | refusals ship empty `content`; thinking blocks precede text blocks | block-type-filtered extraction |
|
|
49
|
+
| Small `max_tokens` + high effort = truncation | thinking tokens spend from `max_tokens`; xhigh turns run minutes | 64K `max_tokens` + streaming everywhere |
|
|
50
|
+
| Cache dies on model switch | prompt caches are **model-scoped**; per-turn routing invalidates the cache every turn | route once per conversation, reuse via `execute()` |
|
|
51
|
+
| Context windows differ | Haiku 4.5 = 200K; every other tier model = 1M | trivial bumped to low above 180K context tokens |
|
|
52
|
+
|
|
53
|
+
## Tiers
|
|
54
|
+
|
|
55
|
+
| Tier | Model | Effort | Notes |
|
|
56
|
+
|------|-------|--------|-------|
|
|
57
|
+
| trivial | `claude-haiku-4-5` | — | reformatting, extraction, classification |
|
|
58
|
+
| low | `claude-sonnet-5` | medium | summaries, single-function edits, simple Q&A |
|
|
59
|
+
| mid | `claude-opus-4-8` | high | debugging, security review, single-file refactors |
|
|
60
|
+
| high | `claude-fable-5` | xhigh | multi-file refactors, migrations, architecture |
|
|
61
|
+
|
|
62
|
+
The keyword-heuristic fallback (used when the classifier call fails) never
|
|
63
|
+
assigns trivial — misrouting real work to Haiku costs more in retries than
|
|
64
|
+
the tier saves, so only the classifier may pick it.
|
|
65
|
+
|
|
66
|
+
## Usage
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
import anthropic
|
|
70
|
+
from model_router import run, execute
|
|
71
|
+
|
|
72
|
+
client = anthropic.Anthropic()
|
|
73
|
+
|
|
74
|
+
result = run(client, "Refactor the auth module across services")
|
|
75
|
+
print(result.route.model, result.route.source) # claude-fable-5 classifier
|
|
76
|
+
print(result.served_by) # may be opus-4-8 after a fallback
|
|
77
|
+
print(result.text)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Multi-turn: prompt caches are model-scoped, so route once per conversation
|
|
81
|
+
and reuse the route for follow-ups —
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
route = result.route
|
|
85
|
+
result2 = execute(client, route, full_message_history)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Cost model
|
|
89
|
+
|
|
90
|
+
1,000 prompts, mixed workload (30% trivial / 40% low / 20% mid / 10% high),
|
|
91
|
+
list pricing, output token counts held equal across models (conservative —
|
|
92
|
+
Fable at xhigh thinks longer than smaller models on the same prompt):
|
|
93
|
+
|
|
94
|
+
| Tier | Prompts | Routed model | Routed cost | All-Fable cost |
|
|
95
|
+
|---|---|---|---|---|
|
|
96
|
+
| trivial | 300 | `claude-haiku-4-5` | $0.54 | $5.40 |
|
|
97
|
+
| low | 400 | `claude-sonnet-5` | $6.60 | $22.00 |
|
|
98
|
+
| mid | 200 | `claude-opus-4-8` | $18.50 | $37.00 |
|
|
99
|
+
| high | 100 | `claude-fable-5` | $80.00 | $80.00 |
|
|
100
|
+
| classifier overhead | 1000 | `claude-haiku-4-5` | $0.72 | — |
|
|
101
|
+
| **total** | | | **$106.36** | **$144.40** |
|
|
102
|
+
|
|
103
|
+
**26% saved** on this mix — and the mix is the whole story: the 10% of
|
|
104
|
+
prompts that legitimately need Fable dominate the bill. A support-bot-shaped
|
|
105
|
+
workload (mostly trivial/low) saves 70%+. Rerun with your own shape:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
python benchmarks/cost_model.py
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
This is a pricing calculation, not a live benchmark — it spends no tokens
|
|
112
|
+
and its assumptions are at the top of the script.
|
|
113
|
+
|
|
114
|
+
### Live benchmark (real tokens, real dollars)
|
|
115
|
+
|
|
116
|
+
[`benchmarks/live_bench.py`](benchmarks/live_bench.py) runs a 17-prompt
|
|
117
|
+
mixed workload through both arms — routed vs. all-Fable — with **real API
|
|
118
|
+
calls**, and prices each call from the `usage` block the API actually
|
|
119
|
+
returned (by the model that actually served it, which matters after a
|
|
120
|
+
fallback). Classifier overhead is metered from real Haiku usage, not
|
|
121
|
+
estimated.
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
ANTHROPIC_API_KEY=... python benchmarks/live_bench.py --tiers trivial,low # cheap smoke run
|
|
125
|
+
ANTHROPIC_API_KEY=... python benchmarks/live_bench.py --out live_results.md # full run, ~$2-6
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
It spends real money (the all-Fable arm at xhigh dominates), refuses to
|
|
129
|
+
start without a key, and flags every prompt where the classifier's tier
|
|
130
|
+
disagreed with the expected tier — so you can audit misroutes instead of
|
|
131
|
+
trusting a single savings number. `RoutedResult.usage` exposes the same
|
|
132
|
+
real token counts in your own code.
|
|
133
|
+
|
|
134
|
+
This is the honest version of the standard methodology: the same
|
|
135
|
+
cost-quality comparison RouterBench and RouteLLM run offline, at a scale
|
|
136
|
+
one person can afford to reproduce.
|
|
137
|
+
|
|
138
|
+
## Claude Code hook: stop subagents from burning Fable tokens
|
|
139
|
+
|
|
140
|
+
Separate deliverable, same philosophy. In Claude Code, **subagents inherit
|
|
141
|
+
the session model by default** — run your session on an expensive model and
|
|
142
|
+
every spawned agent (including greps and file listings) bills at that rate.
|
|
143
|
+
|
|
144
|
+
[`hooks/subagent-router.py`](hooks/subagent-router.py) is a PreToolUse hook
|
|
145
|
+
that denies any Agent spawn missing an explicit `model` param. The deny
|
|
146
|
+
reason carries a routing table, so the session model immediately re-issues
|
|
147
|
+
the spawn with the cheapest capable tier. Self-correcting, one file, no
|
|
148
|
+
dependencies.
|
|
149
|
+
|
|
150
|
+
Install:
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
cp hooks/subagent-router.py ~/.claude/hooks/
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Then add to `~/.claude/settings.json`:
|
|
157
|
+
|
|
158
|
+
```json
|
|
159
|
+
"hooks": {
|
|
160
|
+
"PreToolUse": [
|
|
161
|
+
{
|
|
162
|
+
"matcher": "Agent",
|
|
163
|
+
"hooks": [
|
|
164
|
+
{
|
|
165
|
+
"type": "command",
|
|
166
|
+
"command": "python3 ~/.claude/hooks/subagent-router.py"
|
|
167
|
+
}
|
|
168
|
+
]
|
|
169
|
+
}
|
|
170
|
+
]
|
|
171
|
+
}
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
New sessions then enforce: unrouted spawn → denied with policy → re-spawned
|
|
175
|
+
with explicit `model` (haiku for greps/locates, sonnet for single-file work
|
|
176
|
+
and reviews, opus for multi-file/hard debugging, session model only for
|
|
177
|
+
judgment-critical synthesis).
|
|
178
|
+
|
|
179
|
+
## Install
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
pip install claude-model-router
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
Or from source:
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
git clone https://github.com/junoseong/claude-model-router
|
|
189
|
+
cd claude-model-router
|
|
190
|
+
python3 -m venv .venv && .venv/bin/pip install -e '.[dev]'
|
|
191
|
+
.venv/bin/pytest
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Notes
|
|
195
|
+
|
|
196
|
+
- Fable 5 requires 30-day data retention; zero-data-retention orgs get 400s
|
|
197
|
+
on the high tier — remap `high` to `claude-opus-4-8` in `_TIER_ROUTES`.
|
|
198
|
+
- Sonnet 5 has intro pricing ($2/$10 per MTok) through 2026-08-31; the cost
|
|
199
|
+
model uses sticker prices ($3/$15).
|
|
200
|
+
|
|
201
|
+
MIT licensed.
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# claude-model-router
|
|
2
|
+
|
|
3
|
+
Reference implementation of cost-aware model routing for the Claude API —
|
|
4
|
+
plus a Claude Code hook that stops subagents from silently inheriting your
|
|
5
|
+
most expensive model.
|
|
6
|
+
|
|
7
|
+
A cheap Haiku 4.5 call classifies each prompt's complexity; the prompt then
|
|
8
|
+
executes on the cheapest capable tier. Small, tested, and built around the
|
|
9
|
+
parts naive routers get wrong.
|
|
10
|
+
|
|
11
|
+
## Why naive Claude routers break
|
|
12
|
+
|
|
13
|
+
Most routing examples treat model IDs as interchangeable strings. On the
|
|
14
|
+
Claude 5 family they are not:
|
|
15
|
+
|
|
16
|
+
| Trap | What actually happens | Where handled |
|
|
17
|
+
|---|---|---|
|
|
18
|
+
| Refusals are not exceptions | Fable 5 refusals return **HTTP 200** with `stop_reason: "refusal"` — a try/except fallback never fires | server-side fallback beta + explicit `stop_reason` check ([router.py](model_router/router.py)) |
|
|
19
|
+
| `thinking` differs per model | Fable 5: always on, explicit config → 400. Opus 4.8: omitted = **off**. Sonnet 5: omitted = adaptive. Haiku 4.5: adaptive → 400 | per-model request shape in `build_request()` |
|
|
20
|
+
| `effort` is not universal | `output_config: {"effort": ...}` → 400 on Haiku 4.5 | effort omitted on the trivial tier |
|
|
21
|
+
| `content[0].text` crashes | refusals ship empty `content`; thinking blocks precede text blocks | block-type-filtered extraction |
|
|
22
|
+
| Small `max_tokens` + high effort = truncation | thinking tokens are spent from the `max_tokens` budget; xhigh turns can run for minutes | 64K `max_tokens` + streaming everywhere |
|
|
23
|
+
| Cache dies on model switch | prompt caches are **model-scoped**; per-turn routing invalidates the cache every turn | route once per conversation, reuse via `execute()` |
|
|
24
|
+
| Context windows differ | Haiku 4.5 = 200K; every other tier model = 1M | trivial bumped to low above 180K context tokens |
|
|
25
|
+
|
|
26
|
+
## Tiers
|
|
27
|
+
|
|
28
|
+
| Tier | Model | Effort | Notes |
|
|
29
|
+
|------|-------|--------|-------|
|
|
30
|
+
| trivial | `claude-haiku-4-5` | — | reformatting, extraction, classification |
|
|
31
|
+
| low | `claude-sonnet-5` | medium | summaries, single-function edits, simple Q&A |
|
|
32
|
+
| mid | `claude-opus-4-8` | high | debugging, security review, single-file refactors |
|
|
33
|
+
| high | `claude-fable-5` | xhigh | multi-file refactors, migrations, architecture |
|
|
34
|
+
|
|
35
|
+
The keyword-heuristic fallback (used when the classifier call fails) never
|
|
36
|
+
assigns trivial — misrouting real work to Haiku costs more in retries than
|
|
37
|
+
the tier saves, so only the classifier may pick it.
|
|
38
|
+
|
|
39
|
+
## Usage
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
import anthropic
|
|
43
|
+
from model_router import run, execute
|
|
44
|
+
|
|
45
|
+
client = anthropic.Anthropic()
|
|
46
|
+
|
|
47
|
+
result = run(client, "Refactor the auth module across services")
|
|
48
|
+
print(result.route.model, result.route.source) # claude-fable-5 classifier
|
|
49
|
+
print(result.served_by) # may be claude-opus-4-8 after a fallback
|
|
50
|
+
print(result.text)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Multi-turn: prompt caches are model-scoped, so route once per conversation
|
|
54
|
+
and reuse the route for follow-ups —
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
route = result.route
|
|
58
|
+
result2 = execute(client, route, full_message_history)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Cost model
|
|
62
|
+
|
|
63
|
+
1,000 prompts, mixed workload (30% trivial / 40% low / 20% mid / 10% high),
|
|
64
|
+
list pricing, output token counts held equal across models (conservative —
|
|
65
|
+
Fable at xhigh thinks longer than smaller models on the same prompt):
|
|
66
|
+
|
|
67
|
+
| Tier | Prompts | Routed model | Routed cost | All-Fable cost |
|
|
68
|
+
|---|---|---|---|---|
|
|
69
|
+
| trivial | 300 | `claude-haiku-4-5` | $0.54 | $5.40 |
|
|
70
|
+
| low | 400 | `claude-sonnet-5` | $6.60 | $22.00 |
|
|
71
|
+
| mid | 200 | `claude-opus-4-8` | $18.50 | $37.00 |
|
|
72
|
+
| high | 100 | `claude-fable-5` | $80.00 | $80.00 |
|
|
73
|
+
| classifier overhead | 1000 | `claude-haiku-4-5` | $0.72 | — |
|
|
74
|
+
| **total** | | | **$106.36** | **$144.40** |
|
|
75
|
+
|
|
76
|
+
**26% saved** on this mix — and the mix is the whole story: the 10% of
|
|
77
|
+
prompts that legitimately need Fable dominate the bill. A support-bot-shaped
|
|
78
|
+
workload (mostly trivial/low) saves 70%+. Rerun with your own shape (from a repository checkout — `benchmarks/` is not shipped in the PyPI package):
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
python benchmarks/cost_model.py
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
This is a pricing calculation, not a live benchmark — it spends no tokens
|
|
85
|
+
and its assumptions are at the top of the script.
|
|
86
|
+
|
|
87
|
+
### Live benchmark (real tokens, real dollars)
|
|
88
|
+
|
|
89
|
+
[`benchmarks/live_bench.py`](benchmarks/live_bench.py) runs a 17-prompt
|
|
90
|
+
mixed workload through both arms — routed vs. all-Fable — with **real API
|
|
91
|
+
calls**, and prices each call from the `usage` block the API actually
|
|
92
|
+
returned (by the model that actually served it, which matters after a
|
|
93
|
+
fallback). Classifier overhead is metered from real Haiku usage, not
|
|
94
|
+
estimated.
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
ANTHROPIC_API_KEY=... python benchmarks/live_bench.py --tiers trivial,low # cheap smoke run
|
|
98
|
+
ANTHROPIC_API_KEY=... python benchmarks/live_bench.py --out live_results.md # full run, ~$2-6
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
It spends real money (the all-Fable arm at xhigh dominates), refuses to
|
|
102
|
+
start without a key, and flags every prompt where the classifier's tier
|
|
103
|
+
disagreed with the expected tier — so you can audit misroutes instead of
|
|
104
|
+
trusting a single savings number. `RoutedResult.usage` exposes the same
|
|
105
|
+
real token counts in your own code.
|
|
106
|
+
|
|
107
|
+
This is the honest version of the standard methodology: the same
|
|
108
|
+
cost-quality comparison RouterBench and RouteLLM run offline, at a scale
|
|
109
|
+
one person can afford to reproduce.
|
|
110
|
+
|
|
111
|
+
## Claude Code hook: stop subagents from burning Fable tokens
|
|
112
|
+
|
|
113
|
+
Separate deliverable, same philosophy. In Claude Code, **subagents inherit
|
|
114
|
+
the session model by default** — run your session on an expensive model and
|
|
115
|
+
every spawned agent (including greps and file listings) bills at that rate.
|
|
116
|
+
|
|
117
|
+
[`hooks/subagent-router.py`](hooks/subagent-router.py) is a PreToolUse hook
|
|
118
|
+
that denies any Agent spawn missing an explicit `model` param. The deny
|
|
119
|
+
reason carries a routing table, so the session model immediately re-issues
|
|
120
|
+
the spawn with the cheapest capable tier. Self-correcting, one file, no
|
|
121
|
+
dependencies.
|
|
122
|
+
|
|
123
|
+
Install the hook (from a repository checkout — `hooks/` is not shipped in the PyPI package):
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
cp hooks/subagent-router.py ~/.claude/hooks/
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
Then merge this `hooks` entry into the top-level object of `~/.claude/settings.json`:
|
|
130
|
+
|
|
131
|
+
```json
|
|
132
|
+
"hooks": {
|
|
133
|
+
"PreToolUse": [
|
|
134
|
+
{
|
|
135
|
+
"matcher": "Agent",
|
|
136
|
+
"hooks": [
|
|
137
|
+
{
|
|
138
|
+
"type": "command",
|
|
139
|
+
"command": "python3 ~/.claude/hooks/subagent-router.py"
|
|
140
|
+
}
|
|
141
|
+
]
|
|
142
|
+
}
|
|
143
|
+
]
|
|
144
|
+
}
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
New sessions then enforce: unrouted spawn → denied with policy → re-spawned
|
|
148
|
+
with explicit `model` (haiku for greps/locates, sonnet for single-file work
|
|
149
|
+
and reviews, opus for multi-file/hard debugging, session model only for
|
|
150
|
+
judgment-critical synthesis).
|
|
151
|
+
|
|
152
|
+
## Install
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
pip install claude-model-router
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
Or from source:
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
git clone https://github.com/junoseong/claude-model-router
|
|
162
|
+
cd claude-model-router
|
|
163
|
+
python3 -m venv .venv && .venv/bin/pip install -e '.[dev]'
|
|
164
|
+
.venv/bin/pytest
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Notes
|
|
168
|
+
|
|
169
|
+
- Fable 5 requires 30-day data retention; zero-data-retention orgs get 400s
|
|
170
|
+
on the high tier — remap `high` to `claude-opus-4-8` in `_TIER_ROUTES`.
|
|
171
|
+
- Sonnet 5 has intro pricing ($2/$10 per MTok) through 2026-08-31; the cost
|
|
172
|
+
model uses sticker prices ($3/$15).
|
|
173
|
+
|
|
174
|
+
MIT licensed.
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: claude-model-router
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Cost-aware model routing for the Claude API — reference implementation with a Claude Code subagent hook
|
|
5
|
+
Author: Juno Seong
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/junoseong/claude-model-router
|
|
8
|
+
Project-URL: Repository, https://github.com/junoseong/claude-model-router
|
|
9
|
+
Project-URL: Issues, https://github.com/junoseong/claude-model-router/issues
|
|
10
|
+
Keywords: claude,anthropic,llm,model-routing,cost-optimization
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: anthropic>=0.115
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# claude-model-router
|
|
29
|
+
|
|
30
|
+
Reference implementation of cost-aware model routing for the Claude API —
|
|
31
|
+
plus a Claude Code hook that stops subagents from silently inheriting your
|
|
32
|
+
most expensive model.
|
|
33
|
+
|
|
34
|
+
A cheap Haiku 4.5 call classifies each prompt's complexity; the prompt then
|
|
35
|
+
executes on the cheapest capable tier. Small, tested, and built around the
|
|
36
|
+
parts naive routers get wrong.
|
|
37
|
+
|
|
38
|
+
## Why naive Claude routers break
|
|
39
|
+
|
|
40
|
+
Most routing examples treat model IDs as interchangeable strings. On the
|
|
41
|
+
Claude 5 family they are not:
|
|
42
|
+
|
|
43
|
+
| Trap | What actually happens | Where handled |
|
|
44
|
+
|---|---|---|
|
|
45
|
+
| Refusals are not exceptions | Fable 5 refusals return **HTTP 200** with `stop_reason: "refusal"` — a try/except fallback never fires | server-side fallback beta + explicit `stop_reason` check ([router.py](model_router/router.py)) |
|
|
46
|
+
| `thinking` differs per model | Fable 5: always on, explicit config → 400. Opus 4.8: omitted = **off**. Sonnet 5: omitted = adaptive. Haiku 4.5: adaptive → 400 | per-model request shape in `build_request()` |
|
|
47
|
+
| `effort` is not universal | `output_config: {"effort": ...}` → 400 on Haiku 4.5 | effort omitted on the trivial tier |
|
|
48
|
+
| `content[0].text` crashes | refusals ship empty `content`; thinking blocks precede text blocks | block-type-filtered extraction |
|
|
49
|
+
| Small `max_tokens` + high effort = truncation | thinking tokens spend from `max_tokens`; xhigh turns run minutes | 64K `max_tokens` + streaming everywhere |
|
|
50
|
+
| Cache dies on model switch | prompt caches are **model-scoped**; per-turn routing invalidates the cache every turn | route once per conversation, reuse via `execute()` |
|
|
51
|
+
| Context windows differ | Haiku 4.5 = 200K; every other tier model = 1M | trivial bumped to low above 180K context tokens |
|
|
52
|
+
|
|
53
|
+
## Tiers
|
|
54
|
+
|
|
55
|
+
| Tier | Model | Effort | Notes |
|
|
56
|
+
|------|-------|--------|-------|
|
|
57
|
+
| trivial | `claude-haiku-4-5` | — | reformatting, extraction, classification |
|
|
58
|
+
| low | `claude-sonnet-5` | medium | summaries, single-function edits, simple Q&A |
|
|
59
|
+
| mid | `claude-opus-4-8` | high | debugging, security review, single-file refactors |
|
|
60
|
+
| high | `claude-fable-5` | xhigh | multi-file refactors, migrations, architecture |
|
|
61
|
+
|
|
62
|
+
The keyword-heuristic fallback (used when the classifier call fails) never
|
|
63
|
+
assigns trivial — misrouting real work to Haiku costs more in retries than
|
|
64
|
+
the tier saves, so only the classifier may pick it.
|
|
65
|
+
|
|
66
|
+
## Usage
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
import anthropic
|
|
70
|
+
from model_router import run, execute
|
|
71
|
+
|
|
72
|
+
client = anthropic.Anthropic()
|
|
73
|
+
|
|
74
|
+
result = run(client, "Refactor the auth module across services")
|
|
75
|
+
print(result.route.model, result.route.source) # claude-fable-5 classifier
|
|
76
|
+
print(result.served_by) # may be opus-4-8 after a fallback
|
|
77
|
+
print(result.text)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Multi-turn: prompt caches are model-scoped, so route once per conversation
|
|
81
|
+
and reuse the route for follow-ups —
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
route = result.route
|
|
85
|
+
result2 = execute(client, route, full_message_history)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Cost model
|
|
89
|
+
|
|
90
|
+
1,000 prompts, mixed workload (30% trivial / 40% low / 20% mid / 10% high),
|
|
91
|
+
list pricing, output token counts held equal across models (conservative —
|
|
92
|
+
Fable at xhigh thinks longer than smaller models on the same prompt):
|
|
93
|
+
|
|
94
|
+
| Tier | Prompts | Routed model | Routed cost | All-Fable cost |
|
|
95
|
+
|---|---|---|---|---|
|
|
96
|
+
| trivial | 300 | `claude-haiku-4-5` | $0.54 | $5.40 |
|
|
97
|
+
| low | 400 | `claude-sonnet-5` | $6.60 | $22.00 |
|
|
98
|
+
| mid | 200 | `claude-opus-4-8` | $18.50 | $37.00 |
|
|
99
|
+
| high | 100 | `claude-fable-5` | $80.00 | $80.00 |
|
|
100
|
+
| classifier overhead | 1000 | `claude-haiku-4-5` | $0.72 | — |
|
|
101
|
+
| **total** | | | **$106.36** | **$144.40** |
|
|
102
|
+
|
|
103
|
+
**26% saved** on this mix — and the mix is the whole story: the 10% of
|
|
104
|
+
prompts that legitimately need Fable dominate the bill. A support-bot-shaped
|
|
105
|
+
workload (mostly trivial/low) saves 70%+. Rerun with your own shape:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
python benchmarks/cost_model.py
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
This is a pricing calculation, not a live benchmark — it spends no tokens
|
|
112
|
+
and its assumptions are at the top of the script.
|
|
113
|
+
|
|
114
|
+
### Live benchmark (real tokens, real dollars)
|
|
115
|
+
|
|
116
|
+
[`benchmarks/live_bench.py`](benchmarks/live_bench.py) runs a 17-prompt
|
|
117
|
+
mixed workload through both arms — routed vs. all-Fable — with **real API
|
|
118
|
+
calls**, and prices each call from the `usage` block the API actually
|
|
119
|
+
returned (by the model that actually served it, which matters after a
|
|
120
|
+
fallback). Classifier overhead is metered from real Haiku usage, not
|
|
121
|
+
estimated.
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
ANTHROPIC_API_KEY=... python benchmarks/live_bench.py --tiers trivial,low # cheap smoke run
|
|
125
|
+
ANTHROPIC_API_KEY=... python benchmarks/live_bench.py --out live_results.md # full run, ~$2-6
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
It spends real money (the all-Fable arm at xhigh dominates), refuses to
|
|
129
|
+
start without a key, and flags every prompt where the classifier's tier
|
|
130
|
+
disagreed with the expected tier — so you can audit misroutes instead of
|
|
131
|
+
trusting a single savings number. `RoutedResult.usage` exposes the same
|
|
132
|
+
real token counts in your own code.
|
|
133
|
+
|
|
134
|
+
This is the honest version of the standard methodology: the same
|
|
135
|
+
cost-quality comparison RouterBench and RouteLLM run offline, at a scale
|
|
136
|
+
one person can afford to reproduce.
|
|
137
|
+
|
|
138
|
+
## Claude Code hook: stop subagents from burning Fable tokens
|
|
139
|
+
|
|
140
|
+
Separate deliverable, same philosophy. In Claude Code, **subagents inherit
|
|
141
|
+
the session model by default** — run your session on an expensive model and
|
|
142
|
+
every spawned agent (including greps and file listings) bills at that rate.
|
|
143
|
+
|
|
144
|
+
[`hooks/subagent-router.py`](hooks/subagent-router.py) is a PreToolUse hook
|
|
145
|
+
that denies any Agent spawn missing an explicit `model` param. The deny
|
|
146
|
+
reason carries a routing table, so the session model immediately re-issues
|
|
147
|
+
the spawn with the cheapest capable tier. Self-correcting, one file, no
|
|
148
|
+
dependencies.
|
|
149
|
+
|
|
150
|
+
Install:
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
cp hooks/subagent-router.py ~/.claude/hooks/
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Then add to `~/.claude/settings.json`:
|
|
157
|
+
|
|
158
|
+
```json
|
|
159
|
+
"hooks": {
|
|
160
|
+
"PreToolUse": [
|
|
161
|
+
{
|
|
162
|
+
"matcher": "Agent",
|
|
163
|
+
"hooks": [
|
|
164
|
+
{
|
|
165
|
+
"type": "command",
|
|
166
|
+
"command": "python3 ~/.claude/hooks/subagent-router.py"
|
|
167
|
+
}
|
|
168
|
+
]
|
|
169
|
+
}
|
|
170
|
+
]
|
|
171
|
+
}
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
New sessions then enforce: unrouted spawn → denied with policy → re-spawned
|
|
175
|
+
with explicit `model` (haiku for greps/locates, sonnet for single-file work
|
|
176
|
+
and reviews, opus for multi-file/hard debugging, session model only for
|
|
177
|
+
judgment-critical synthesis).
|
|
178
|
+
|
|
179
|
+
## Install
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
pip install claude-model-router
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
Or from source:
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
git clone https://github.com/junoseong/claude-model-router
|
|
189
|
+
cd claude-model-router
|
|
190
|
+
python3 -m venv .venv && .venv/bin/pip install -e '.[dev]'
|
|
191
|
+
.venv/bin/pytest
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Notes
|
|
195
|
+
|
|
196
|
+
- Fable 5 requires 30-day data retention; zero-data-retention orgs get 400s
|
|
197
|
+
on the high tier — remap `high` to `claude-opus-4-8` in `_TIER_ROUTES`.
|
|
198
|
+
- Sonnet 5 has intro pricing ($2/$10 per MTok) through 2026-08-31; the cost
|
|
199
|
+
model uses sticker prices ($3/$15).
|
|
200
|
+
|
|
201
|
+
MIT licensed.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
claude_model_router.egg-info/PKG-INFO
|
|
5
|
+
claude_model_router.egg-info/SOURCES.txt
|
|
6
|
+
claude_model_router.egg-info/dependency_links.txt
|
|
7
|
+
claude_model_router.egg-info/requires.txt
|
|
8
|
+
claude_model_router.egg-info/top_level.txt
|
|
9
|
+
model_router/__init__.py
|
|
10
|
+
model_router/classifier.py
|
|
11
|
+
model_router/router.py
|
|
12
|
+
tests/test_classifier.py
|
|
13
|
+
tests/test_router.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
model_router
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Cost-aware model routing for the Claude API.
|
|
2
|
+
|
|
3
|
+
Classifies prompt complexity with a cheap Haiku call (keyword heuristic
|
|
4
|
+
as fallback) and executes on the cheapest capable model tier:
|
|
5
|
+
|
|
6
|
+
trivial -> claude-haiku-4-5 (no thinking; bumped to low if context > 180K)
|
|
7
|
+
low -> claude-sonnet-5 @ medium effort
|
|
8
|
+
mid -> claude-opus-4-8 @ high effort (adaptive thinking)
|
|
9
|
+
high -> claude-fable-5 @ xhigh effort (server-side fallback to Opus 4.8)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from .classifier import Route, Tier, classify, heuristic_tier
|
|
13
|
+
from .router import RefusalError, RoutedResult, build_request, execute, run
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"Route",
|
|
17
|
+
"Tier",
|
|
18
|
+
"classify",
|
|
19
|
+
"heuristic_tier",
|
|
20
|
+
"RefusalError",
|
|
21
|
+
"RoutedResult",
|
|
22
|
+
"build_request",
|
|
23
|
+
"execute",
|
|
24
|
+
"run",
|
|
25
|
+
]
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Prompt-complexity classification for model routing.
|
|
2
|
+
|
|
3
|
+
Two stages: a cheap Haiku 4.5 call for semantic tiering, with a
|
|
4
|
+
keyword/size heuristic as fallback when the API call fails or returns
|
|
5
|
+
something unparseable. The classifier call costs a few hundred tokens
|
|
6
|
+
at $1/$5 per MTok — noise next to the downstream call it's routing.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from typing import Literal
|
|
13
|
+
|
|
14
|
+
import anthropic
|
|
15
|
+
|
|
16
|
+
Tier = Literal["trivial", "low", "mid", "high"]
|
|
17
|
+
|
|
18
|
+
CLASSIFIER_MODEL = "claude-haiku-4-5"
|
|
19
|
+
|
|
20
|
+
# Haiku 4.5 has a 200K context window (every other tier model has 1M);
|
|
21
|
+
# trivial-tier work that arrives with more context than this gets bumped
|
|
22
|
+
# to the low tier instead. Margin left for output and system prompt.
|
|
23
|
+
_HAIKU_CONTEXT_LIMIT = 180_000
|
|
24
|
+
|
|
25
|
+
# Complexity is judged from the head of the prompt plus size metadata;
|
|
26
|
+
# feeding the whole prompt to the classifier would defeat its purpose.
|
|
27
|
+
_CLASSIFY_SNIPPET_CHARS = 2000
|
|
28
|
+
|
|
29
|
+
_CLASSIFIER_SYSTEM = """\
|
|
30
|
+
You route coding prompts to one of four capability tiers. Reply with exactly one word.
|
|
31
|
+
|
|
32
|
+
trivial - reformatting, extraction, classification, lookups with obvious answers
|
|
33
|
+
low - summaries, single-function edits, boilerplate, simple Q&A
|
|
34
|
+
mid - debugging, security review, tricky edge cases, single-file refactors
|
|
35
|
+
high - multi-file refactors, migrations, architecture design, codebase-wide optimization
|
|
36
|
+
|
|
37
|
+
Reply with: trivial, low, mid, or high. Nothing else."""
|
|
38
|
+
|
|
39
|
+
_HIGH_KEYWORDS = (
|
|
40
|
+
"migration",
|
|
41
|
+
"architecture",
|
|
42
|
+
"multi-file",
|
|
43
|
+
"optimize codebase",
|
|
44
|
+
"refactor across",
|
|
45
|
+
)
|
|
46
|
+
_MID_KEYWORDS = (
|
|
47
|
+
"debug",
|
|
48
|
+
"security audit",
|
|
49
|
+
"edge case",
|
|
50
|
+
"race condition",
|
|
51
|
+
"fuzzing",
|
|
52
|
+
"regex",
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
# effort None = model predates output_config.effort; omit the param.
|
|
56
|
+
_TIER_ROUTES: dict[Tier, tuple[str, str | None]] = {
|
|
57
|
+
"trivial": ("claude-haiku-4-5", None),
|
|
58
|
+
"low": ("claude-sonnet-5", "medium"),
|
|
59
|
+
"mid": ("claude-opus-4-8", "high"),
|
|
60
|
+
"high": ("claude-fable-5", "xhigh"),
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@dataclass(frozen=True)
class Route:
    """Immutable routing decision: the model to call and how it was chosen."""

    # Model ID sent as the ``model`` field of the downstream request.
    model: str
    # Effort level for output_config; None means the parameter is omitted.
    effort: str | None
    # Complexity tier this route was resolved from.
    tier: Tier
    # "classifier" | "heuristic" | "fixed" (caller-pinned route)
    source: str
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def heuristic_tier(prompt: str, context_tokens: int = 0) -> Tier:
    """Pick a tier from keywords and context size when the classifier is unavailable.

    The prompt is lower-cased and scanned for the high-tier keywords, then
    the mid-tier keywords; a context above 400K tokens forces "high" and
    one above 100K tokens forces at least "mid".

    "trivial" is never returned: keywords can't reliably separate trivial
    from low, and misrouting real work to Haiku costs more in retries
    than the tier saves. Only the classifier may assign trivial.
    """
    lowered = prompt.lower()

    def mentions(keywords) -> bool:
        # Plain substring match against the lower-cased prompt.
        return any(word in lowered for word in keywords)

    if context_tokens > 400_000 or mentions(_HIGH_KEYWORDS):
        return "high"
    if context_tokens > 100_000 or mentions(_MID_KEYWORDS):
        return "mid"
    return "low"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def classify(
    client: anthropic.Anthropic, prompt: str, context_tokens: int = 0
) -> Route:
    """Resolve a prompt to a Route; classifier API failures never propagate.

    Sends the head of the prompt plus the context size to the classifier
    model and accepts its reply only when, after trimming whitespace and
    lower-casing, it is exactly one of the known tier names. Any
    ``anthropic.AnthropicError``, an empty reply, or an unrecognised reply
    falls back to ``heuristic_tier``. Other exception types are not caught.
    """
    # Only the first _CLASSIFY_SNIPPET_CHARS characters are sent.
    user_content = (
        f"context_tokens={context_tokens}\n---\n"
        f"{prompt[:_CLASSIFY_SNIPPET_CHARS]}"
    )
    verdict = ""
    try:
        reply = client.messages.create(
            model=CLASSIFIER_MODEL,
            max_tokens=8,
            system=_CLASSIFIER_SYSTEM,
            messages=[{"role": "user", "content": user_content}],
        )
        # The verdict is the first text block; other block types are skipped.
        for block in reply.content:
            if block.type == "text":
                verdict = block.text.strip().lower()
                break
    except anthropic.AnthropicError:
        verdict = ""

    if verdict in _TIER_ROUTES:
        tier: Tier = verdict  # type: ignore[assignment]
        source = "classifier"
    else:
        tier = heuristic_tier(prompt, context_tokens)
        source = "heuristic"

    # Trivial work with more context than the Haiku limit moves up to "low".
    if tier == "trivial" and context_tokens > _HAIKU_CONTEXT_LIMIT:
        tier = "low"

    model, effort = _TIER_ROUTES[tier]
    return Route(model=model, effort=effort, tier=tier, source=source)
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Routed execution against the Messages API.
|
|
2
|
+
|
|
3
|
+
Handles the per-model API differences a naive router misses:
|
|
4
|
+
|
|
5
|
+
- Fable 5: thinking is always on (the param must be omitted), and
|
|
6
|
+
refusals arrive as HTTP 200 with stop_reason "refusal" — so requests
|
|
7
|
+
opt into the server-side fallback beta, which re-serves a declined
|
|
8
|
+
request on Opus 4.8 inside the same call.
|
|
9
|
+
- Opus 4.8: thinking is OFF when omitted, so it is requested
|
|
10
|
+
explicitly as adaptive.
|
|
11
|
+
- Sonnet 5: adaptive thinking is already the default when omitted;
|
|
12
|
+
sending it explicitly is harmless and keeps the code uniform.
|
|
13
|
+
- All calls stream: xhigh turns can run minutes, and the SDK rejects
|
|
14
|
+
non-streaming requests at this max_tokens.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from typing import Any
|
|
21
|
+
|
|
22
|
+
import anthropic
|
|
23
|
+
|
|
24
|
+
from .classifier import Route, classify
|
|
25
|
+
|
|
26
|
+
MAX_TOKENS = 64_000
|
|
27
|
+
FALLBACK_BETA = "server-side-fallback-2026-06-01"
|
|
28
|
+
FABLE_FALLBACK_MODEL = "claude-opus-4-8"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class RefusalError(RuntimeError):
    """Signals that a request was declined and no fallback rescued it.

    The refusal payload passed to the constructor is kept on
    ``stop_details`` and is also rendered into the exception message.
    """

    def __init__(self, stop_details: Any) -> None:
        self.stop_details = stop_details
        message = f"Request refused: {stop_details}"
        super().__init__(message)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(frozen=True)
class RoutedResult:
    """Immutable outcome of one routed call."""

    # Concatenated text of the response's text blocks.
    text: str
    # The route the request was sent with.
    route: Route
    # Actual model that answered; differs from route.model after a fallback.
    served_by: str
    # stop_reason reported on the final message, if any.
    stop_reason: str | None
    # response.usage — real token counts, for cost tracking.
    usage: Any = None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def build_request(route: Route, messages: list[dict]) -> dict:
    """Build the keyword arguments for a Messages API call on ``route``.

    Every request carries ``model``, ``max_tokens`` and ``messages``; the
    remaining parameters depend on the model family:

    - Haiku: nothing else is sent. Haiku 4.5 predates adaptive thinking
      and effort, and sending either returns a 400, so ``route.effort`` is
      dropped even when a caller-pinned route sets it.
    - Fable 5 (including dated IDs that start with ``claude-fable-5``):
      effort plus the server-side fallback beta; no ``thinking`` key,
      because an explicit thinking config is rejected on Fable.
    - Everything else: effort plus explicit adaptive thinking.

    Args:
        route: Routing decision supplying the model ID and effort level.
        messages: Conversation turns, passed through unmodified.

    Returns:
        A dict suitable for ``client.messages.stream(**kwargs)``, or for
        ``client.beta.messages.stream(**kwargs)`` when it contains "betas".
    """
    model = route.model
    kwargs: dict = {
        "model": model,
        "max_tokens": MAX_TOKENS,
        "messages": messages,
    }
    if model.startswith("claude-haiku"):
        # Thinking defaults off, which is right for trivial work; effort is
        # skipped too so a mis-pinned route cannot trigger a guaranteed 400.
        return kwargs
    if route.effort is not None:
        kwargs["output_config"] = {"effort": route.effort}
    # Prefix match, consistent with the Haiku check, so dated Fable 5 IDs
    # do not fall through to the explicit-thinking branch.
    if model.startswith("claude-fable-5"):
        kwargs["betas"] = [FALLBACK_BETA]
        kwargs["fallbacks"] = [{"model": FABLE_FALLBACK_MODEL}]
    else:
        kwargs["thinking"] = {"type": "adaptive"}
    return kwargs
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def execute(
    client: anthropic.Anthropic, route: Route, messages: list[dict]
) -> RoutedResult:
    """Stream one request on ``route`` and collect the final message.

    Requests whose kwargs include "betas" go through ``client.beta.messages``;
    all others use ``client.messages``.

    Raises:
        RefusalError: if the final message has stop_reason "refusal".
    """
    request = build_request(route, messages)
    if "betas" in request:
        endpoint = client.beta.messages
    else:
        endpoint = client.messages

    with endpoint.stream(**request) as stream:
        final = stream.get_final_message()

    if final.stop_reason == "refusal":
        raise RefusalError(getattr(final, "stop_details", None))

    # Only text blocks contribute to the returned text.
    text_parts = [block.text for block in final.content if block.type == "text"]
    return RoutedResult(
        text="".join(text_parts),
        route=route,
        served_by=final.model,
        stop_reason=final.stop_reason,
        usage=getattr(final, "usage", None),
    )
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def run(
    client: anthropic.Anthropic, prompt: str, context_tokens: int = 0
) -> RoutedResult:
    """Classify ``prompt``, then execute it as a single user turn.

    Route once per conversation, not per turn: prompt caches are
    model-scoped, so reuse the returned RoutedResult.route (via
    execute()) for follow-up turns on the same conversation.
    """
    opening_turn = {"role": "user", "content": prompt}
    chosen = classify(client, prompt, context_tokens)
    return execute(client, chosen, [opening_turn])
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "claude-model-router"
|
|
3
|
+
version = "0.2.0"
|
|
4
|
+
description = "Cost-aware model routing for the Claude API — reference implementation with a Claude Code subagent hook"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
authors = [{ name = "Juno Seong" }]
|
|
8
|
+
requires-python = ">=3.10"
|
|
9
|
+
dependencies = ["anthropic>=0.115"]
|
|
10
|
+
keywords = ["claude", "anthropic", "llm", "model-routing", "cost-optimization"]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 4 - Beta",
|
|
13
|
+
"Intended Audience :: Developers",
|
|
14
|
+
"License :: OSI Approved :: MIT License",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3.10",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Programming Language :: Python :: 3.12",
|
|
19
|
+
"Programming Language :: Python :: 3.13",
|
|
20
|
+
"Topic :: Software Development :: Libraries",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.urls]
|
|
24
|
+
Homepage = "https://github.com/junoseong/claude-model-router"
|
|
25
|
+
Repository = "https://github.com/junoseong/claude-model-router"
|
|
26
|
+
Issues = "https://github.com/junoseong/claude-model-router/issues"
|
|
27
|
+
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
dev = ["pytest>=8"]
|
|
30
|
+
|
|
31
|
+
[tool.pytest.ini_options]
|
|
32
|
+
testpaths = ["tests"]
|
|
33
|
+
|
|
34
|
+
[build-system]
|
|
35
|
+
requires = ["setuptools>=68"]
|
|
36
|
+
build-backend = "setuptools.build_meta"
|
|
37
|
+
|
|
38
|
+
[tool.setuptools]
|
|
39
|
+
packages = ["model_router"]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import anthropic
|
|
2
|
+
import pytest
|
|
3
|
+
|
|
4
|
+
from model_router.classifier import CLASSIFIER_MODEL, classify, heuristic_tier
|
|
5
|
+
from tests.conftest import FakeClient, FakeMessages, make_response, text_block, thinking_block
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TestHeuristicTier:
    """Keyword and context-size rules of the fallback heuristic."""

    def test_high_keyword(self):
        tier = heuristic_tier("plan the database migration to postgres")
        assert tier == "high"

    def test_high_on_huge_context(self):
        tier = heuristic_tier("summarize this", context_tokens=500_000)
        assert tier == "high"

    def test_mid_keyword(self):
        tier = heuristic_tier("debug this flaky test")
        assert tier == "mid"

    def test_mid_on_large_context(self):
        tier = heuristic_tier("summarize this", context_tokens=150_000)
        assert tier == "mid"

    def test_default_low(self):
        tier = heuristic_tier("what does this function return?")
        assert tier == "low"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class TestTrivialTier:
    """The trivial tier is assigned only by the classifier and respects the Haiku limit."""

    def test_classifier_verdict_routes_to_haiku(self):
        verdict = make_response([text_block("trivial")])
        client = FakeClient(FakeMessages(create_response=verdict))
        route = classify(client, "extract the date from this line")
        assert (route.model, route.effort) == ("claude-haiku-4-5", None)
        assert route.tier == "trivial"

    def test_trivial_bumped_to_low_when_context_exceeds_haiku_window(self):
        verdict = make_response([text_block("trivial")])
        client = FakeClient(FakeMessages(create_response=verdict))
        route = classify(client, "extract the date", context_tokens=190_000)
        assert route.model == "claude-sonnet-5"
        assert route.tier == "low"

    def test_heuristic_never_assigns_trivial(self):
        failing = FakeMessages(create_error=anthropic.AnthropicError("boom"))
        client = FakeClient(failing)
        route = classify(client, "reformat this csv line")
        assert route.source == "heuristic"
        assert route.model == "claude-sonnet-5"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class TestClassify:
    """Classifier verdict parsing and the fallback paths of classify()."""

    def test_uses_classifier_verdict(self):
        fake = FakeMessages(create_response=make_response([text_block("high")]))
        route = classify(FakeClient(fake), "restructure everything")
        assert (route.model, route.effort) == ("claude-fable-5", "xhigh")
        assert route.source == "classifier"
        first_call = fake.create_calls[0]
        assert first_call["model"] == CLASSIFIER_MODEL

    def test_verdict_whitespace_and_case_tolerated(self):
        fake = FakeMessages(create_response=make_response([text_block(" Mid \n")]))
        route = classify(FakeClient(fake), "anything")
        assert (route.model, route.effort) == ("claude-opus-4-8", "high")

    def test_skips_non_text_blocks(self):
        blocks = [thinking_block(), text_block("low")]
        fake = FakeMessages(create_response=make_response(blocks))
        route = classify(FakeClient(fake), "anything")
        assert route.model == "claude-sonnet-5"

    def test_garbage_verdict_falls_back_to_heuristic(self):
        blocks = [text_block("extremely hard")]
        fake = FakeMessages(create_response=make_response(blocks))
        route = classify(FakeClient(fake), "debug this crash")
        assert route.source == "heuristic"
        assert route.model == "claude-opus-4-8"

    def test_empty_content_falls_back_to_heuristic(self):
        fake = FakeMessages(create_response=make_response([]))
        route = classify(FakeClient(fake), "quick question")
        assert route.source == "heuristic"
        assert route.model == "claude-sonnet-5"

    def test_api_error_falls_back_to_heuristic(self):
        fake = FakeMessages(create_error=anthropic.AnthropicError("boom"))
        route = classify(FakeClient(fake), "plan the migration")
        assert route.source == "heuristic"
        assert route.model == "claude-fable-5"

    def test_non_anthropic_errors_propagate(self):
        fake = FakeMessages(create_error=ValueError("bug in caller"))
        client = FakeClient(fake)
        with pytest.raises(ValueError):
            classify(client, "anything")

    def test_long_prompt_truncated_for_classifier(self):
        fake = FakeMessages(create_response=make_response([text_block("low")]))
        classify(FakeClient(fake), "x" * 50_000)
        first_call = fake.create_calls[0]
        sent = first_call["messages"][0]["content"]
        assert len(sent) < 3_000
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
|
|
3
|
+
from model_router.classifier import Route
|
|
4
|
+
from model_router.router import (
|
|
5
|
+
FABLE_FALLBACK_MODEL,
|
|
6
|
+
FALLBACK_BETA,
|
|
7
|
+
MAX_TOKENS,
|
|
8
|
+
RefusalError,
|
|
9
|
+
build_request,
|
|
10
|
+
execute,
|
|
11
|
+
run,
|
|
12
|
+
)
|
|
13
|
+
from tests.conftest import FakeClient, FakeMessages, make_response, text_block, thinking_block
|
|
14
|
+
|
|
15
|
+
FABLE_ROUTE = Route(model="claude-fable-5", effort="xhigh", tier="high", source="classifier")
|
|
16
|
+
OPUS_ROUTE = Route(model="claude-opus-4-8", effort="high", tier="mid", source="classifier")
|
|
17
|
+
SONNET_ROUTE = Route(model="claude-sonnet-5", effort="medium", tier="low", source="classifier")
|
|
18
|
+
HAIKU_ROUTE = Route(model="claude-haiku-4-5", effort=None, tier="trivial", source="classifier")
|
|
19
|
+
|
|
20
|
+
MSGS = [{"role": "user", "content": "hello"}]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TestBuildRequest:
    """Per-model request parameters produced by build_request()."""

    def test_fable_gets_fallbacks_and_no_thinking(self):
        request = build_request(FABLE_ROUTE, MSGS)
        assert request["betas"] == [FALLBACK_BETA]
        assert request["fallbacks"] == [{"model": FABLE_FALLBACK_MODEL}]
        # explicit thinking config 400s on Fable
        assert "thinking" not in request
        assert request["output_config"] == {"effort": "xhigh"}
        assert request["max_tokens"] == MAX_TOKENS

    def test_opus_gets_adaptive_thinking_and_no_fallbacks(self):
        request = build_request(OPUS_ROUTE, MSGS)
        assert request["thinking"] == {"type": "adaptive"}
        assert "betas" not in request
        assert "fallbacks" not in request

    def test_sonnet_gets_adaptive_thinking(self):
        request = build_request(SONNET_ROUTE, MSGS)
        assert request["thinking"] == {"type": "adaptive"}

    def test_haiku_gets_no_thinking_no_effort_no_fallbacks(self):
        request = build_request(HAIKU_ROUTE, MSGS)
        # adaptive thinking 400s on Haiku 4.5
        assert "thinking" not in request
        # effort param 400s on Haiku 4.5
        assert "output_config" not in request
        assert "betas" not in request
        assert "fallbacks" not in request
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class TestExecute:
    """Namespace selection, refusal handling and result assembly in execute()."""

    def test_fable_routes_through_beta_namespace(self):
        answer = make_response([text_block("answer")], model="claude-fable-5")
        beta = FakeMessages(stream_response=answer)
        client = FakeClient(FakeMessages(), beta_messages=beta)
        result = execute(client, FABLE_ROUTE, MSGS)
        assert result.text == "answer"
        assert len(beta.stream_calls) == 1
        assert client.messages.stream_calls == []

    def test_non_fable_uses_regular_namespace(self):
        answer = make_response([text_block("ok")], model="claude-opus-4-8")
        regular = FakeMessages(stream_response=answer)
        result = execute(FakeClient(regular), OPUS_ROUTE, MSGS)
        assert result.text == "ok"
        assert len(regular.stream_calls) == 1

    def test_refusal_raises_with_details(self):
        refused = make_response(
            [], stop_reason="refusal", stop_details={"category": "test"}
        )
        beta = FakeMessages(stream_response=refused)
        client = FakeClient(FakeMessages(), beta_messages=beta)
        with pytest.raises(RefusalError) as exc:
            execute(client, FABLE_ROUTE, MSGS)
        assert exc.value.stop_details == {"category": "test"}

    def test_text_extraction_skips_thinking_blocks(self):
        blocks = [thinking_block(), text_block("part1 "), text_block("part2")]
        regular = FakeMessages(stream_response=make_response(blocks))
        result = execute(FakeClient(regular), OPUS_ROUTE, MSGS)
        assert result.text == "part1 part2"

    def test_usage_passed_through_for_cost_tracking(self):
        from types import SimpleNamespace

        counts = SimpleNamespace(input_tokens=123, output_tokens=456)
        answer = make_response([text_block("ok")], usage=counts)
        regular = FakeMessages(stream_response=answer)
        result = execute(FakeClient(regular), OPUS_ROUTE, MSGS)
        assert result.usage.input_tokens == 123
        assert result.usage.output_tokens == 456

    def test_served_by_reports_fallback_model(self):
        rescued = make_response([text_block("rescued")], model="claude-opus-4-8")
        beta = FakeMessages(stream_response=rescued)
        client = FakeClient(FakeMessages(), beta_messages=beta)
        result = execute(client, FABLE_ROUTE, MSGS)
        assert result.route.model == "claude-fable-5"
        assert result.served_by == "claude-opus-4-8"
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class TestRun:
    """End-to-end path: run() classifies first, then executes on the chosen route."""

    def test_classifies_then_executes(self):
        verdict = make_response([text_block("mid")])
        answer = make_response([text_block("done")], model="claude-opus-4-8")
        fake = FakeMessages(create_response=verdict, stream_response=answer)
        result = run(FakeClient(fake), "debug this")
        assert result.route.model == "claude-opus-4-8"
        assert result.text == "done"
        expected_turns = [{"role": "user", "content": "debug this"}]
        assert fake.stream_calls[0]["messages"] == expected_turns
|