shiftgate 0.1.8__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {shiftgate-0.1.8 → shiftgate-0.2.0}/PKG-INFO +77 -2
- {shiftgate-0.1.8 → shiftgate-0.2.0}/README.md +73 -1
- {shiftgate-0.1.8 → shiftgate-0.2.0}/pyproject.toml +4 -1
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/cli.py +113 -5
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/router/matcher.py +64 -28
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/router/router.py +8 -1
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/runtime/backend.py +168 -8
- shiftgate-0.2.0/shiftgate/serve/__init__.py +11 -0
- shiftgate-0.2.0/shiftgate/serve/app.py +271 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/utils/display.py +28 -2
- {shiftgate-0.1.8 → shiftgate-0.2.0}/tests/test_backend.py +95 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/tests/test_packaging.py +2 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/tests/test_router.py +88 -0
- shiftgate-0.2.0/tests/test_serve.py +208 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/.gitignore +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/__init__.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/data/__init__.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/data/default_tasks.json +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/feedback/__init__.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/feedback/loop.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/registry/__init__.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/registry/adapter_registry.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/registry/schemas.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/registry/task_registry.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/router/__init__.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/router/embedder.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/runtime/__init__.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/shiftgate/utils/__init__.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/tests/__init__.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/tests/test_feedback.py +0 -0
- {shiftgate-0.1.8 → shiftgate-0.2.0}/tests/test_registry.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: shiftgate
|
|
3
|
-
Version: 0.1.8
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Intelligent routing layer that automatically selects the right LoRA adapter for each task in your local agent loop.
|
|
5
5
|
Project-URL: Homepage, https://github.com/shiftgate-ai/shiftgate
|
|
6
6
|
Project-URL: Repository, https://github.com/shiftgate-ai/shiftgate
|
|
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
18
18
|
Classifier: Programming Language :: Python :: 3.12
|
|
19
19
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
20
20
|
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: fastapi>=0.110.0
|
|
21
22
|
Requires-Dist: fastembed>=0.3.0
|
|
22
23
|
Requires-Dist: httpx>=0.27.0
|
|
23
24
|
Requires-Dist: huggingface-hub>=0.22.0
|
|
@@ -25,7 +26,9 @@ Requires-Dist: numpy>=1.26.0
|
|
|
25
26
|
Requires-Dist: pydantic>=2.6.0
|
|
26
27
|
Requires-Dist: rich>=13.7.0
|
|
27
28
|
Requires-Dist: scikit-learn>=1.4.0
|
|
29
|
+
Requires-Dist: sse-starlette>=2.1.0
|
|
28
30
|
Requires-Dist: typer>=0.12.0
|
|
31
|
+
Requires-Dist: uvicorn[standard]>=0.29.0
|
|
29
32
|
Provides-Extra: dev
|
|
30
33
|
Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
|
|
31
34
|
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
@@ -109,7 +112,60 @@ shiftgate route "write a python sorting function"
|
|
|
109
112
|
shiftgate run "write a python sorting function"
|
|
110
113
|
```
|
|
111
114
|
|
|
112
|
-
**Essential commands:** `init` · `adapter add` · `route` · `run` · `doctor`
|
|
115
|
+
**Essential commands:** `init` · `adapter add` · `route` · `run` · `doctor` · `serve`
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Use as an OpenAI-compatible proxy
|
|
120
|
+
|
|
121
|
+
`shiftgate serve` exposes the router as a drop-in OpenAI endpoint. Any client that speaks OpenAI can point at it and get auto-routing for free — just pass `model="auto"`.
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
# Start the proxy (defaults to http://127.0.0.1:9000)
|
|
125
|
+
shiftgate serve
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
# Use it from any OpenAI client
|
|
130
|
+
from openai import OpenAI
|
|
131
|
+
|
|
132
|
+
client = OpenAI(base_url="http://localhost:9000/v1", api_key="not-needed")
|
|
133
|
+
client.chat.completions.create(
|
|
134
|
+
model="auto", # ← shiftgate picks the right adapter
|
|
135
|
+
messages=[{"role": "user", "content": "write a sql query"}],
|
|
136
|
+
)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
When `model="auto"`, shiftgate routes the request to the best adapter and rewrites `model` to that adapter's backend name before forwarding upstream. The response carries an `X-Shiftgate-Route: <adapter_id> (<score>)` header so you can see what was chosen. Passing any other model id bypasses routing and forwards verbatim. Streaming (`stream: true`) is piped straight through via SSE.
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
shiftgate serve --port 9000 --host 127.0.0.1 --backend auto # backend: auto | ollama | vllm | cerebras
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
> Bind defaults to `127.0.0.1` (localhost only). Pass `--host 0.0.0.0` to expose it on your network.
|
|
146
|
+
|
|
147
|
+
### Drop-in for Cursor / Aider / LangChain
|
|
148
|
+
|
|
149
|
+
Point each tool's OpenAI base URL at the proxy and use `model="auto"`:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
# Cursor → Settings → Models → Override OpenAI Base URL
|
|
153
|
+
http://localhost:9000/v1
|
|
154
|
+
|
|
155
|
+
# Aider
|
|
156
|
+
aider --openai-api-base http://localhost:9000/v1 --openai-api-key not-needed --model auto
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
# LangChain
|
|
161
|
+
from langchain_openai import ChatOpenAI
|
|
162
|
+
|
|
163
|
+
llm = ChatOpenAI(
|
|
164
|
+
base_url="http://localhost:9000/v1",
|
|
165
|
+
api_key="not-needed",
|
|
166
|
+
model="auto",
|
|
167
|
+
)
|
|
168
|
+
```
|
|
113
169
|
|
|
114
170
|
---
|
|
115
171
|
|
|
@@ -220,6 +276,10 @@ User query
|
|
|
220
276
|
└────────────────────────────────┘
|
|
221
277
|
```
|
|
222
278
|
|
|
279
|
+
### How routing works
|
|
280
|
+
|
|
281
|
+
When a backend is active, shiftgate filters candidate adapters to only those actually loaded on that backend. Switch from vLLM to Cerebras and shiftgate automatically picks Cerebras-compatible adapters — no re-registration needed. (When you run `shiftgate route` with no backend running, no filtering is applied, so you still see the full routing preview.)
|
|
282
|
+
|
|
223
283
|
---
|
|
224
284
|
|
|
225
285
|
## Bring Your Own Models
|
|
@@ -286,6 +346,20 @@ shiftgate adapter add sql-lora --local /models/sql-lora --tags sql --base llama3
|
|
|
286
346
|
|
|
287
347
|
Useful for exploring routing decisions before your backend is set up. To run inference, load the adapter in vLLM or Ollama and re-register with `--runtime`.
|
|
288
348
|
|
|
349
|
+
### Option 4 — Cerebras (cloud)
|
|
350
|
+
|
|
351
|
+
shiftgate also supports [Cerebras](https://cerebras.ai/) as a cloud fallback. It uses Cerebras' OpenAI-compatible API and authenticates with a bearer token from the `CEREBRAS_API_KEY` environment variable (or the `--cerebras-key` global flag).
|
|
352
|
+
|
|
353
|
+
```bash
|
|
354
|
+
export CEREBRAS_API_KEY=csk-...
|
|
355
|
+
shiftgate adapter add llama3.1-8b --runtime llama3.1-8b --tags general --base llama3.1
|
|
356
|
+
shiftgate run "write a python sorting function"
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
shiftgate auto-detects backends in the order **Ollama → vLLM → Cerebras**, so local backends always win and Cerebras is used only when no local backend is running.
|
|
360
|
+
|
|
361
|
+
> **Honest status:** shiftgate routes to Cerebras' base-model inference today. When Cerebras Multi-LoRA goes public, register your adapter with `--runtime <cerebras-lora-id>` and it just works — no shiftgate update needed.
|
|
362
|
+
|
|
289
363
|
---
|
|
290
364
|
|
|
291
365
|
## How to contribute adapters
|
|
@@ -393,6 +467,7 @@ shiftgate/
|
|
|
393
467
|
| `shiftgate route "<query>"` | Route a query and show the decision — no inference |
|
|
394
468
|
| `shiftgate route "<query>" --explain` | Full decision tree: task scores, candidates, selection reason |
|
|
395
469
|
| `shiftgate run "<query>"` | Route + run via Ollama or vLLM |
|
|
470
|
+
| `shiftgate serve [--port 9000] [--host …] [--backend …]` | Run an OpenAI-compatible auto-routing proxy |
|
|
396
471
|
| `shiftgate doctor` | Full health check: embedder, backend, adapters, task embeddings |
|
|
397
472
|
| `shiftgate adapter add <hf_repo> [--tags …] [--base …]` | Register adapter from HuggingFace (metadata only) |
|
|
398
473
|
| `shiftgate adapter add <id> --local <path> [--tags …]` | Register a local adapter path |
|
|
@@ -76,7 +76,60 @@ shiftgate route "write a python sorting function"
|
|
|
76
76
|
shiftgate run "write a python sorting function"
|
|
77
77
|
```
|
|
78
78
|
|
|
79
|
-
**Essential commands:** `init` · `adapter add` · `route` · `run` · `doctor`
|
|
79
|
+
**Essential commands:** `init` · `adapter add` · `route` · `run` · `doctor` · `serve`
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Use as an OpenAI-compatible proxy
|
|
84
|
+
|
|
85
|
+
`shiftgate serve` exposes the router as a drop-in OpenAI endpoint. Any client that speaks OpenAI can point at it and get auto-routing for free — just pass `model="auto"`.
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# Start the proxy (defaults to http://127.0.0.1:9000)
|
|
89
|
+
shiftgate serve
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
# Use it from any OpenAI client
|
|
94
|
+
from openai import OpenAI
|
|
95
|
+
|
|
96
|
+
client = OpenAI(base_url="http://localhost:9000/v1", api_key="not-needed")
|
|
97
|
+
client.chat.completions.create(
|
|
98
|
+
model="auto", # ← shiftgate picks the right adapter
|
|
99
|
+
messages=[{"role": "user", "content": "write a sql query"}],
|
|
100
|
+
)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
When `model="auto"`, shiftgate routes the request to the best adapter and rewrites `model` to that adapter's backend name before forwarding upstream. The response carries an `X-Shiftgate-Route: <adapter_id> (<score>)` header so you can see what was chosen. Passing any other model id bypasses routing and forwards verbatim. Streaming (`stream: true`) is piped straight through via SSE.
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
shiftgate serve --port 9000 --host 127.0.0.1 --backend auto # backend: auto | ollama | vllm | cerebras
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
> Bind defaults to `127.0.0.1` (localhost only). Pass `--host 0.0.0.0` to expose it on your network.
|
|
110
|
+
|
|
111
|
+
### Drop-in for Cursor / Aider / LangChain
|
|
112
|
+
|
|
113
|
+
Point each tool's OpenAI base URL at the proxy and use `model="auto"`:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# Cursor → Settings → Models → Override OpenAI Base URL
|
|
117
|
+
http://localhost:9000/v1
|
|
118
|
+
|
|
119
|
+
# Aider
|
|
120
|
+
aider --openai-api-base http://localhost:9000/v1 --openai-api-key not-needed --model auto
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
# LangChain
|
|
125
|
+
from langchain_openai import ChatOpenAI
|
|
126
|
+
|
|
127
|
+
llm = ChatOpenAI(
|
|
128
|
+
base_url="http://localhost:9000/v1",
|
|
129
|
+
api_key="not-needed",
|
|
130
|
+
model="auto",
|
|
131
|
+
)
|
|
132
|
+
```
|
|
80
133
|
|
|
81
134
|
---
|
|
82
135
|
|
|
@@ -187,6 +240,10 @@ User query
|
|
|
187
240
|
└────────────────────────────────┘
|
|
188
241
|
```
|
|
189
242
|
|
|
243
|
+
### How routing works
|
|
244
|
+
|
|
245
|
+
When a backend is active, shiftgate filters candidate adapters to only those actually loaded on that backend. Switch from vLLM to Cerebras and shiftgate automatically picks Cerebras-compatible adapters — no re-registration needed. (When you run `shiftgate route` with no backend running, no filtering is applied, so you still see the full routing preview.)
|
|
246
|
+
|
|
190
247
|
---
|
|
191
248
|
|
|
192
249
|
## Bring Your Own Models
|
|
@@ -253,6 +310,20 @@ shiftgate adapter add sql-lora --local /models/sql-lora --tags sql --base llama3
|
|
|
253
310
|
|
|
254
311
|
Useful for exploring routing decisions before your backend is set up. To run inference, load the adapter in vLLM or Ollama and re-register with `--runtime`.
|
|
255
312
|
|
|
313
|
+
### Option 4 — Cerebras (cloud)
|
|
314
|
+
|
|
315
|
+
shiftgate also supports [Cerebras](https://cerebras.ai/) as a cloud fallback. It uses Cerebras' OpenAI-compatible API and authenticates with a bearer token from the `CEREBRAS_API_KEY` environment variable (or the `--cerebras-key` global flag).
|
|
316
|
+
|
|
317
|
+
```bash
|
|
318
|
+
export CEREBRAS_API_KEY=csk-...
|
|
319
|
+
shiftgate adapter add llama3.1-8b --runtime llama3.1-8b --tags general --base llama3.1
|
|
320
|
+
shiftgate run "write a python sorting function"
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
shiftgate auto-detects backends in the order **Ollama → vLLM → Cerebras**, so local backends always win and Cerebras is used only when no local backend is running.
|
|
324
|
+
|
|
325
|
+
> **Honest status:** shiftgate routes to Cerebras' base-model inference today. When Cerebras Multi-LoRA goes public, register your adapter with `--runtime <cerebras-lora-id>` and it just works — no shiftgate update needed.
|
|
326
|
+
|
|
256
327
|
---
|
|
257
328
|
|
|
258
329
|
## How to contribute adapters
|
|
@@ -360,6 +431,7 @@ shiftgate/
|
|
|
360
431
|
| `shiftgate route "<query>"` | Route a query and show the decision — no inference |
|
|
361
432
|
| `shiftgate route "<query>" --explain` | Full decision tree: task scores, candidates, selection reason |
|
|
362
433
|
| `shiftgate run "<query>"` | Route + run via Ollama or vLLM |
|
|
434
|
+
| `shiftgate serve [--port 9000] [--host …] [--backend …]` | Run an OpenAI-compatible auto-routing proxy |
|
|
363
435
|
| `shiftgate doctor` | Full health check: embedder, backend, adapters, task embeddings |
|
|
364
436
|
| `shiftgate adapter add <hf_repo> [--tags …] [--base …]` | Register adapter from HuggingFace (metadata only) |
|
|
365
437
|
| `shiftgate adapter add <id> --local <path> [--tags …]` | Register a local adapter path |
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "shiftgate"
|
|
7
|
-
version = "0.1.8"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "Intelligent routing layer that automatically selects the right LoRA adapter for each task in your local agent loop."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -31,6 +31,9 @@ dependencies = [
|
|
|
31
31
|
"scikit-learn>=1.4.0",
|
|
32
32
|
"httpx>=0.27.0",
|
|
33
33
|
"huggingface-hub>=0.22.0",
|
|
34
|
+
"fastapi>=0.110.0",
|
|
35
|
+
"uvicorn[standard]>=0.29.0",
|
|
36
|
+
"sse-starlette>=2.1.0",
|
|
34
37
|
]
|
|
35
38
|
|
|
36
39
|
[project.optional-dependencies]
|
|
@@ -17,6 +17,7 @@ from typing import Annotated, Optional
|
|
|
17
17
|
|
|
18
18
|
import typer
|
|
19
19
|
from rich.console import Console
|
|
20
|
+
from rich.panel import Panel
|
|
20
21
|
from rich.prompt import Confirm, Prompt
|
|
21
22
|
|
|
22
23
|
from shiftgate.registry.schemas import AdapterEntry, TaskCluster
|
|
@@ -39,6 +40,26 @@ app.add_typer(task_app, name="task")
|
|
|
39
40
|
app.add_typer(feedback_app, name="feedback")
|
|
40
41
|
|
|
41
42
|
|
|
43
|
+
@app.callback()
|
|
44
|
+
def _main(
|
|
45
|
+
cerebras_key: Annotated[
|
|
46
|
+
Optional[str],
|
|
47
|
+
typer.Option(
|
|
48
|
+
"--cerebras-key",
|
|
49
|
+
help=(
|
|
50
|
+
"Cerebras API key. If passed, sets CEREBRAS_API_KEY for this run "
|
|
51
|
+
"so the Cerebras cloud backend becomes available."
|
|
52
|
+
),
|
|
53
|
+
),
|
|
54
|
+
] = None,
|
|
55
|
+
) -> None:
|
|
56
|
+
"""Global options applied before any command runs."""
|
|
57
|
+
if cerebras_key:
|
|
58
|
+
import os
|
|
59
|
+
|
|
60
|
+
os.environ["CEREBRAS_API_KEY"] = cerebras_key
|
|
61
|
+
|
|
62
|
+
|
|
42
63
|
# ---------------------------------------------------------------------------
|
|
43
64
|
# Helpers
|
|
44
65
|
# ---------------------------------------------------------------------------
|
|
@@ -62,6 +83,19 @@ def _get_embedder():
|
|
|
62
83
|
return Embedder()
|
|
63
84
|
|
|
64
85
|
|
|
86
|
+
def _active_runtimes(backend_router) -> set[str] | None:
|
|
87
|
+
"""Return the set of runtime names loaded on the active backend, or None.
|
|
88
|
+
|
|
89
|
+
``None`` means no backend is active → the router should not filter
|
|
90
|
+
(preview behaviour). An empty set means a backend is active but reports no
|
|
91
|
+
loaded models.
|
|
92
|
+
"""
|
|
93
|
+
active = backend_router.active_backend
|
|
94
|
+
if active is None:
|
|
95
|
+
return None
|
|
96
|
+
return set(active.list_loaded_adapters())
|
|
97
|
+
|
|
98
|
+
|
|
65
99
|
def _auto_link_adapter(adapter: AdapterEntry, task_reg) -> list[str]:
|
|
66
100
|
"""Add ``adapter.id`` to the ``preferred_adapters`` of matching task clusters.
|
|
67
101
|
|
|
@@ -443,6 +477,7 @@ def route(
|
|
|
443
477
|
"""
|
|
444
478
|
from shiftgate.feedback import loop as feedback_loop
|
|
445
479
|
from shiftgate.router import router as routing
|
|
480
|
+
from shiftgate.runtime.backend import BackendRouter
|
|
446
481
|
from shiftgate.utils.display import show_explain_decision, show_routing_decision
|
|
447
482
|
|
|
448
483
|
task_reg, adapter_reg = _load_registries()
|
|
@@ -453,8 +488,15 @@ def route(
|
|
|
453
488
|
|
|
454
489
|
embedder = _get_embedder()
|
|
455
490
|
|
|
491
|
+
backend_router = BackendRouter()
|
|
492
|
+
backend_name = backend_router.detect()
|
|
493
|
+
available_runtimes = _active_runtimes(backend_router)
|
|
494
|
+
|
|
456
495
|
try:
|
|
457
|
-
trace, match_result = routing.route(
|
|
496
|
+
trace, match_result = routing.route(
|
|
497
|
+
query, task_reg, adapter_reg, embedder,
|
|
498
|
+
top_k=top_k, available_runtimes=available_runtimes,
|
|
499
|
+
)
|
|
458
500
|
except Exception as exc:
|
|
459
501
|
console.print(f"[red]Routing error:[/red] {exc}")
|
|
460
502
|
raise typer.Exit(1)
|
|
@@ -466,7 +508,9 @@ def route(
|
|
|
466
508
|
trace,
|
|
467
509
|
adapter=adapter,
|
|
468
510
|
task_name=task.name if task else None,
|
|
469
|
-
backend_name=
|
|
511
|
+
backend_name=backend_name,
|
|
512
|
+
loaded_runtimes=available_runtimes,
|
|
513
|
+
selection_method=match_result.selection_method,
|
|
470
514
|
)
|
|
471
515
|
|
|
472
516
|
if explain:
|
|
@@ -503,22 +547,29 @@ def run(
|
|
|
503
547
|
|
|
504
548
|
embedder = _get_embedder()
|
|
505
549
|
|
|
550
|
+
backend_router = BackendRouter()
|
|
551
|
+
backend_name = backend_router.detect()
|
|
552
|
+
available_runtimes = _active_runtimes(backend_router)
|
|
553
|
+
|
|
506
554
|
try:
|
|
507
|
-
trace, match_result = routing.route(
|
|
555
|
+
trace, match_result = routing.route(
|
|
556
|
+
query, task_reg, adapter_reg, embedder,
|
|
557
|
+
top_k=top_k, available_runtimes=available_runtimes,
|
|
558
|
+
)
|
|
508
559
|
except Exception as exc:
|
|
509
560
|
console.print(f"[red]Routing error:[/red] {exc}")
|
|
510
561
|
raise typer.Exit(1)
|
|
511
562
|
|
|
512
563
|
adapter = adapter_reg.get_adapter(trace.selected_adapter_id)
|
|
513
564
|
task = task_reg.get_task(trace.matched_task_id)
|
|
514
|
-
backend_router = BackendRouter()
|
|
515
|
-
backend_name = backend_router.detect()
|
|
516
565
|
|
|
517
566
|
show_routing_decision(
|
|
518
567
|
trace,
|
|
519
568
|
adapter=adapter,
|
|
520
569
|
task_name=task.name if task else None,
|
|
521
570
|
backend_name=backend_name,
|
|
571
|
+
loaded_runtimes=available_runtimes,
|
|
572
|
+
selection_method=match_result.selection_method,
|
|
522
573
|
)
|
|
523
574
|
|
|
524
575
|
if adapter is None:
|
|
@@ -705,6 +756,63 @@ def doctor() -> None:
|
|
|
705
756
|
)
|
|
706
757
|
|
|
707
758
|
|
|
759
|
+
# ---------------------------------------------------------------------------
|
|
760
|
+
# shiftgate serve
|
|
761
|
+
# ---------------------------------------------------------------------------
|
|
762
|
+
|
|
763
|
+
@app.command()
|
|
764
|
+
def serve(
|
|
765
|
+
port: Annotated[int, typer.Option("--port", help="Port to listen on.")] = 9000,
|
|
766
|
+
host: Annotated[
|
|
767
|
+
str,
|
|
768
|
+
typer.Option(
|
|
769
|
+
"--host",
|
|
770
|
+
help="Host to bind. Use 0.0.0.0 to expose on the network (default: localhost only).",
|
|
771
|
+
),
|
|
772
|
+
] = "127.0.0.1",
|
|
773
|
+
backend: Annotated[
|
|
774
|
+
str,
|
|
775
|
+
typer.Option(
|
|
776
|
+
"--backend",
|
|
777
|
+
help="Backend to forward to: auto | ollama | vllm | cerebras.",
|
|
778
|
+
),
|
|
779
|
+
] = "auto",
|
|
780
|
+
) -> None:
|
|
781
|
+
"""Run an OpenAI-compatible proxy that auto-routes `model="auto"` requests.
|
|
782
|
+
|
|
783
|
+
Point any OpenAI client (Cursor, Aider, LangChain, the OpenAI SDK) at this
|
|
784
|
+
URL and pass ``model="auto"`` to get shiftgate routing for free.
|
|
785
|
+
"""
|
|
786
|
+
import uvicorn
|
|
787
|
+
|
|
788
|
+
from shiftgate.serve import create_app
|
|
789
|
+
|
|
790
|
+
try:
|
|
791
|
+
app_instance = create_app(backend=backend)
|
|
792
|
+
except Exception as exc:
|
|
793
|
+
console.print(f"[red]Failed to start serve:[/red] {exc}")
|
|
794
|
+
raise typer.Exit(1)
|
|
795
|
+
|
|
796
|
+
backend_router = app_instance.state.backend_router
|
|
797
|
+
backend_name = backend_router.active_backend_name or "none detected"
|
|
798
|
+
|
|
799
|
+
console.print(
|
|
800
|
+
Panel(
|
|
801
|
+
f"[bold green]shiftgate serve[/bold green] listening on "
|
|
802
|
+
f"[cyan]http://{host}:{port}[/cyan]\n"
|
|
803
|
+
f"backend: [bold]{backend_name}[/bold]\n\n"
|
|
804
|
+
"Point your OpenAI client at this URL with "
|
|
805
|
+
"[bold magenta]model='auto'[/bold magenta]:\n"
|
|
806
|
+
f" [dim]base_url=\"http://{host}:{port}/v1\"[/dim]",
|
|
807
|
+
title="OpenAI-compatible proxy",
|
|
808
|
+
border_style="green",
|
|
809
|
+
expand=False,
|
|
810
|
+
)
|
|
811
|
+
)
|
|
812
|
+
|
|
813
|
+
uvicorn.run(app_instance, host=host, port=port, log_level="info")
|
|
814
|
+
|
|
815
|
+
|
|
708
816
|
# ---------------------------------------------------------------------------
|
|
709
817
|
# shiftgate demo
|
|
710
818
|
# ---------------------------------------------------------------------------
|
|
@@ -117,6 +117,7 @@ def top_k_tasks(
|
|
|
117
117
|
def select_adapter(
|
|
118
118
|
top_tasks: list[TaskMatch],
|
|
119
119
|
adapter_registry, # AdapterRegistry — avoid circular import with string hint
|
|
120
|
+
available_runtimes: set[str] | None = None,
|
|
120
121
|
) -> MatchResult:
|
|
121
122
|
"""Select the adapter linked to the best-matching task.
|
|
122
123
|
|
|
@@ -125,15 +126,23 @@ def select_adapter(
|
|
|
125
126
|
For each top task (highest score first), walk ``preferred_adapters`` then
|
|
126
127
|
``fallback_adapters`` and collect the adapters that exist in the registry
|
|
127
128
|
(populating ``TaskMatch.candidate_adapters`` for the ``--explain`` view).
|
|
128
|
-
The first
|
|
129
|
+
The first viable adapter found, on the highest-scoring task, is selected.
|
|
130
|
+
|
|
131
|
+
Backend-aware filtering
|
|
132
|
+
-----------------------
|
|
133
|
+
When ``available_runtimes`` is provided (the set of model/adapter names
|
|
134
|
+
actually loaded on the active backend), only adapters whose
|
|
135
|
+
``effective_backend_name()`` is in that set are considered viable. If a
|
|
136
|
+
task's entire candidate list is filtered out, selection falls through to
|
|
137
|
+
the next-best task. When ``available_runtimes`` is ``None`` no filtering
|
|
138
|
+
happens (the preview behaviour used by ``shiftgate route``).
|
|
129
139
|
|
|
130
140
|
No silent fallback
|
|
131
141
|
------------------
|
|
132
142
|
If the matched (top) task has **no** linked adapter in the registry, the
|
|
133
143
|
router must NOT substitute an arbitrary adapter — doing so silently routes,
|
|
134
144
|
e.g., a music query to a SQL adapter and destroys trust. Instead this
|
|
135
|
-
returns a ``MatchResult`` with ``selected_adapter=None`` and
|
|
136
|
-
``selection_method="no_adapter_for_task"``.
|
|
145
|
+
returns a ``MatchResult`` with ``selected_adapter=None``.
|
|
137
146
|
|
|
138
147
|
Parameters
|
|
139
148
|
----------
|
|
@@ -141,39 +150,56 @@ def select_adapter(
|
|
|
141
150
|
Output of ``top_k_tasks`` (sorted by score descending).
|
|
142
151
|
adapter_registry:
|
|
143
152
|
``AdapterRegistry`` instance to look up adapter IDs.
|
|
153
|
+
available_runtimes:
|
|
154
|
+
Optional set of runtime names loaded on the active backend. When set,
|
|
155
|
+
adapters not in the set are skipped during selection.
|
|
144
156
|
|
|
145
157
|
Returns
|
|
146
158
|
-------
|
|
147
|
-
``MatchResult``. ``selected_adapter`` is ``None`` when no adapter is
|
|
148
|
-
|
|
149
|
-
|
|
159
|
+
``MatchResult``. ``selected_adapter`` is ``None`` when no viable adapter is
|
|
160
|
+
found. ``selection_method`` is ``"no_adapter_on_active_backend"`` when
|
|
161
|
+
linked adapters exist but none are loaded on the active backend, otherwise
|
|
162
|
+
``"no_adapter_for_task"``. The ``matched_task`` is always the top-scoring
|
|
163
|
+
task so callers can still report what was matched.
|
|
150
164
|
"""
|
|
151
|
-
|
|
152
|
-
|
|
165
|
+
def _is_viable(adapter) -> bool:
|
|
166
|
+
if available_runtimes is None:
|
|
167
|
+
return True
|
|
168
|
+
return adapter.effective_backend_name() in available_runtimes
|
|
169
|
+
|
|
153
170
|
explicit_result: MatchResult | None = None
|
|
171
|
+
any_linked_adapter = False # any task had at least one registered adapter
|
|
154
172
|
|
|
155
173
|
for tm in top_tasks:
|
|
156
174
|
preferred_ids = list(tm.task.preferred_adapters)
|
|
157
175
|
fallback_ids = list(tm.task.fallback_adapters)
|
|
158
176
|
|
|
177
|
+
# Populate candidate_adapters with every registered adapter (for the
|
|
178
|
+
# --explain view, showing all candidates regardless of runtime).
|
|
159
179
|
for adapter_id in preferred_ids + fallback_ids:
|
|
160
180
|
adapter = adapter_registry.get_adapter(adapter_id)
|
|
161
181
|
if adapter is not None and adapter not in tm.candidate_adapters:
|
|
162
182
|
tm.candidate_adapters.append(adapter)
|
|
163
183
|
|
|
164
|
-
if
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
184
|
+
if tm.candidate_adapters:
|
|
185
|
+
any_linked_adapter = True
|
|
186
|
+
|
|
187
|
+
if explicit_result is None:
|
|
188
|
+
viable = [a for a in tm.candidate_adapters if _is_viable(a)]
|
|
189
|
+
if viable:
|
|
190
|
+
chosen = viable[0]
|
|
191
|
+
method = (
|
|
192
|
+
"preferred"
|
|
193
|
+
if chosen.id in tm.task.preferred_adapters
|
|
194
|
+
else "fallback"
|
|
195
|
+
)
|
|
196
|
+
explicit_result = MatchResult(
|
|
197
|
+
selected_adapter=chosen,
|
|
198
|
+
matched_task=tm.task,
|
|
199
|
+
similarity_score=tm.score,
|
|
200
|
+
all_task_matches=top_tasks,
|
|
201
|
+
selection_method=method,
|
|
202
|
+
)
|
|
177
203
|
|
|
178
204
|
if explicit_result is not None:
|
|
179
205
|
logger.debug(
|
|
@@ -185,19 +211,29 @@ def select_adapter(
|
|
|
185
211
|
)
|
|
186
212
|
return explicit_result
|
|
187
213
|
|
|
188
|
-
# No adapter
|
|
189
|
-
#
|
|
214
|
+
# No viable adapter across any ranked task. Distinguish "nothing linked at
|
|
215
|
+
# all" from "linked but not loaded on the active backend".
|
|
190
216
|
top_task = top_tasks[0]
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
217
|
+
if available_runtimes is not None and any_linked_adapter:
|
|
218
|
+
method = "no_adapter_on_active_backend"
|
|
219
|
+
logger.info(
|
|
220
|
+
"Linked adapter(s) for task '%s' exist but none are loaded on the "
|
|
221
|
+
"active backend — refusing to guess.",
|
|
222
|
+
top_task.task.id,
|
|
223
|
+
)
|
|
224
|
+
else:
|
|
225
|
+
method = "no_adapter_for_task"
|
|
226
|
+
logger.info(
|
|
227
|
+
"No linked adapter for matched task '%s' — refusing to guess.",
|
|
228
|
+
top_task.task.id,
|
|
229
|
+
)
|
|
230
|
+
|
|
195
231
|
return MatchResult(
|
|
196
232
|
selected_adapter=None,
|
|
197
233
|
matched_task=top_task.task,
|
|
198
234
|
similarity_score=top_task.score,
|
|
199
235
|
all_task_matches=top_tasks,
|
|
200
|
-
selection_method="no_adapter_for_task",
|
|
236
|
+
selection_method=method,
|
|
201
237
|
)
|
|
202
238
|
|
|
203
239
|
|
|
@@ -26,6 +26,7 @@ def route(
|
|
|
26
26
|
adapter_registry: AdapterRegistry,
|
|
27
27
|
embedder: Embedder,
|
|
28
28
|
top_k: int = 3,
|
|
29
|
+
available_runtimes: set[str] | None = None,
|
|
29
30
|
) -> tuple[RoutingTrace, MatchResult]:
|
|
30
31
|
"""Route a query string to the best matching adapter.
|
|
31
32
|
|
|
@@ -48,6 +49,12 @@ def route(
|
|
|
48
49
|
``Embedder`` instance (wraps fastembed singleton).
|
|
49
50
|
top_k:
|
|
50
51
|
Number of top task candidates to consider. Defaults to 3.
|
|
52
|
+
available_runtimes:
|
|
53
|
+
Optional set of runtime names loaded on the active backend. When set,
|
|
54
|
+
adapters whose ``effective_backend_name()`` is not in the set are
|
|
55
|
+
skipped, falling through to the next-best task. If no viable adapter
|
|
56
|
+
is found across all top-K tasks, the trace's ``selected_adapter_id`` is
|
|
57
|
+
``None`` and ``selection_method`` is ``"no_adapter_on_active_backend"``.
|
|
51
58
|
|
|
52
59
|
Returns
|
|
53
60
|
-------
|
|
@@ -73,7 +80,7 @@ def route(
|
|
|
73
80
|
query_embedding = embedder.embed(query)
|
|
74
81
|
all_tasks = task_registry.get_all_tasks()
|
|
75
82
|
ranked = top_k_tasks(query_embedding, all_tasks, k=top_k)
|
|
76
|
-
result = select_adapter(ranked, adapter_registry)
|
|
83
|
+
result = select_adapter(ranked, adapter_registry, available_runtimes=available_runtimes)
|
|
77
84
|
|
|
78
85
|
selected_id = result.selected_adapter.id if result.selected_adapter else None
|
|
79
86
|
|